mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-28 03:42:45 +00:00
Compare commits
28 Commits
fix-drop-s
...
feature-ar
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d5248838ca | ||
|
|
6eb6e352da | ||
|
|
d89a86643d | ||
|
|
84ab36ba70 | ||
|
|
de97eea3e2 | ||
|
|
68322376f2 | ||
|
|
2729b0d3dc | ||
|
|
07c0ed5e26 | ||
|
|
d684452588 | ||
|
|
e00658375b | ||
|
|
a0cf673f1b | ||
|
|
300432ae05 | ||
|
|
6ba1b726be | ||
|
|
38d2abb982 | ||
|
|
cd653959d6 | ||
|
|
338cadf284 | ||
|
|
9383471fa0 | ||
|
|
0060b46c8b | ||
|
|
b153ec803b | ||
|
|
38dba60ceb | ||
|
|
ae0474450f | ||
|
|
8efb01010c | ||
|
|
d18bbfa9c3 | ||
|
|
ec76d3c762 | ||
|
|
bdc0a58242 | ||
|
|
b049ad9626 | ||
|
|
79def8a200 | ||
|
|
701735f6e5 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -111,3 +111,4 @@ celerybeat-schedule*
|
||||
|
||||
# ignore pnpm package store folder created when setting up the devcontainer
|
||||
.pnpm-store/
|
||||
.worktrees
|
||||
|
||||
@@ -2437,17 +2437,3 @@ src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "Non
|
||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr]
|
||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr]
|
||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "user_args" [union-attr]
|
||||
src/paperless_text/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
||||
src/paperless_text/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
||||
src/paperless_text/parsers.py:0: error: Incompatible types in assignment (expression has type "str", variable has type "None") [assignment]
|
||||
src/paperless_text/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_text/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Argument 1 to "make_thumbnail_from_pdf" has incompatible type "None"; expected "Path" [arg-type]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a return type annotation [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "None") [assignment]
|
||||
src/paperless_tika/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_tika/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
|
||||
@@ -801,11 +801,14 @@ parsing documents.
|
||||
|
||||
#### [`PAPERLESS_OCR_MODE=<mode>`](#PAPERLESS_OCR_MODE) {#PAPERLESS_OCR_MODE}
|
||||
|
||||
: Tell paperless when and how to perform ocr on your documents. Three
|
||||
: Tell paperless when and how to perform ocr on your documents. Four
|
||||
modes are available:
|
||||
|
||||
- `skip`: Paperless skips all pages and will perform ocr only on
|
||||
pages where no text is present. This is the safest option.
|
||||
- `auto` (default): Paperless detects whether a document already
|
||||
has embedded text via pdftotext. If sufficient text is found,
|
||||
OCR is skipped for that document (`--skip-text`). If no text is
|
||||
present, OCR runs normally. This is the safest option for mixed
|
||||
document collections.
|
||||
|
||||
- `redo`: Paperless will OCR all pages of your documents and
|
||||
attempt to replace any existing text layers with new text. This
|
||||
@@ -823,24 +826,58 @@ modes are available:
|
||||
significantly larger and text won't appear as sharp when zoomed
|
||||
in.
|
||||
|
||||
The default is `skip`, which only performs OCR when necessary and
|
||||
always creates archived documents.
|
||||
- `off`: Paperless never invokes the OCR engine. For PDFs, text
|
||||
is extracted via pdftotext only. For image documents, text will
|
||||
be empty. Archive file generation still works via format
|
||||
conversion (no Tesseract or Ghostscript required).
|
||||
|
||||
The default is `auto`.
|
||||
|
||||
Read more about this in the [OCRmyPDF
|
||||
documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
|
||||
|
||||
#### [`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=<mode>`](#PAPERLESS_OCR_SKIP_ARCHIVE_FILE) {#PAPERLESS_OCR_SKIP_ARCHIVE_FILE}
|
||||
#### [`PAPERLESS_ARCHIVE_FILE_GENERATION=<mode>`](#PAPERLESS_ARCHIVE_FILE_GENERATION) {#PAPERLESS_ARCHIVE_FILE_GENERATION}
|
||||
|
||||
: Specify when you would like paperless to skip creating an archived
|
||||
version of your documents. This is useful if you don't want to have two
|
||||
almost-identical versions of your documents in the media folder.
|
||||
: Controls when paperless creates a PDF/A archive version of your
|
||||
documents. Archive files are stored alongside the original and are used
|
||||
for display in the web interface.
|
||||
|
||||
- `never`: Never skip creating an archived version.
|
||||
- `with_text`: Skip creating an archived version for documents
|
||||
that already have embedded text.
|
||||
- `always`: Always skip creating an archived version.
|
||||
- `auto` (default): Produce archives for scanned or image-based
|
||||
documents. Skip archive generation for born-digital PDFs that
|
||||
already contain embedded text. This is the recommended setting
|
||||
for mixed document collections.
|
||||
- `always`: Always produce a PDF/A archive when the parser
|
||||
supports it, regardless of whether the document already has
|
||||
text.
|
||||
- `never`: Never produce an archive. Only the original file is
|
||||
stored. Saves disk space but the web viewer will display the
|
||||
original file directly.
|
||||
|
||||
The default is `never`.
|
||||
**Behaviour by file type and mode** (`auto` column shows the default):
|
||||
|
||||
| Document type | `never` | `auto` (default) | `always` |
|
||||
| -------------------------- | ------- | -------------------------- | -------- |
|
||||
| Scanned image (TIFF, JPEG) | No | **Yes** | Yes |
|
||||
| Image-based PDF | No | **Yes** (short/no text, untagged) | Yes |
|
||||
| Born-digital PDF | No | No (tagged or has embedded text) | Yes |
|
||||
| Plain text, email, HTML | No | No | No |
|
||||
| DOCX / ODT (via Tika) | Yes\* | Yes\* | Yes\* |
|
||||
|
||||
\* Tika always produces a PDF rendition for display; this counts as
|
||||
the archive regardless of the setting.
|
||||
|
||||
!!! note
|
||||
|
||||
This setting applies to the built-in Tesseract parser. Parsers
|
||||
that must always convert documents to PDF for display (e.g. DOCX,
|
||||
ODT via Tika) will produce a PDF regardless of this setting.
|
||||
|
||||
!!! note
|
||||
|
||||
The **remote OCR parser** (Azure AI) always produces a searchable
|
||||
PDF and stores it as the archive copy, regardless of this setting.
|
||||
`ARCHIVE_FILE_GENERATION=never` has no effect when the remote
|
||||
parser handles a document.
|
||||
|
||||
#### [`PAPERLESS_OCR_CLEAN=<mode>`](#PAPERLESS_OCR_CLEAN) {#PAPERLESS_OCR_CLEAN}
|
||||
|
||||
|
||||
@@ -104,6 +104,65 @@ Multiple options are combined in a single value:
|
||||
PAPERLESS_DB_OPTIONS="sslmode=require;sslrootcert=/certs/ca.pem;pool.max_size=10"
|
||||
```
|
||||
|
||||
## OCR and Archive File Generation Settings
|
||||
|
||||
The settings that control OCR behaviour and archive file generation have been redesigned. The old settings that coupled these two concerns together are **removed** — there are no migration shims.
|
||||
|
||||
### Removed settings
|
||||
|
||||
| Removed Setting | Replacement |
|
||||
| ------------------------------------------- | --------------------------------------------------------------------- |
|
||||
| `PAPERLESS_OCR_MODE=skip` | `PAPERLESS_OCR_MODE=auto` (new default) |
|
||||
| `PAPERLESS_OCR_MODE=skip_noarchive` | `PAPERLESS_OCR_MODE=auto` + `PAPERLESS_ARCHIVE_FILE_GENERATION=never` |
|
||||
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never` | `PAPERLESS_ARCHIVE_FILE_GENERATION=always` |
|
||||
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text` | `PAPERLESS_ARCHIVE_FILE_GENERATION=auto` (new default) |
|
||||
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always` | `PAPERLESS_ARCHIVE_FILE_GENERATION=never` |
|
||||
|
||||
### What changed and why
|
||||
|
||||
Previously, `OCR_MODE` conflated two independent concerns: whether to run OCR and whether to produce an archive. `skip` meant "skip OCR if text exists, but always produce an archive". `skip_noarchive` meant "skip OCR if text exists, and also skip the archive". This made it impossible to, for example, disable OCR entirely while still producing archives.
|
||||
|
||||
The new settings are independent:
|
||||
|
||||
- [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) controls OCR: `auto` (default), `force`, `redo`, `off`.
|
||||
- [`PAPERLESS_ARCHIVE_FILE_GENERATION`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) controls archive production: `auto` (default), `always`, `never`.
|
||||
|
||||
### Action Required
|
||||
|
||||
Remove any `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` variable from your environment. If you relied on `OCR_MODE=skip` or `OCR_MODE=skip_noarchive`, update accordingly:
|
||||
|
||||
```bash
|
||||
# v2: skip OCR when text present, always archive
|
||||
PAPERLESS_OCR_MODE=skip
|
||||
# v3: equivalent (auto is the new default)
|
||||
# No change needed — auto is the default
|
||||
|
||||
# v2: skip OCR when text present, skip archive too
|
||||
PAPERLESS_OCR_MODE=skip_noarchive
|
||||
# v3: equivalent
|
||||
PAPERLESS_OCR_MODE=auto
|
||||
PAPERLESS_ARCHIVE_FILE_GENERATION=never
|
||||
|
||||
# v2: always skip archive
|
||||
PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always
|
||||
# v3: equivalent
|
||||
PAPERLESS_ARCHIVE_FILE_GENERATION=never
|
||||
|
||||
# v2: skip archive only for born-digital docs
|
||||
PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text
|
||||
# v3: equivalent (auto is the new default)
|
||||
PAPERLESS_ARCHIVE_FILE_GENERATION=auto
|
||||
```
|
||||
|
||||
Paperless will emit a startup warning if the old environment variables are still set.
|
||||
|
||||
### Remote OCR parser
|
||||
|
||||
If you use the **remote OCR parser** (Azure AI), note that it always produces a
|
||||
searchable PDF and stores it as the archive copy. `ARCHIVE_FILE_GENERATION=never`
|
||||
has no effect for documents handled by the remote parser — the archive is produced
|
||||
unconditionally by the remote engine.
|
||||
|
||||
## OpenID Connect Token Endpoint Authentication
|
||||
|
||||
Some existing OpenID Connect setups may require an explicit token endpoint authentication method after upgrading to v3.
|
||||
|
||||
@@ -633,12 +633,11 @@ hardware, but a few settings can improve performance:
|
||||
consumption, so you might want to lower these settings (example: 2
|
||||
workers and 1 thread to always have some computing power left for
|
||||
other tasks).
|
||||
- Keep [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) at its default value `skip` and consider
|
||||
- Keep [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) at its default value `auto` and consider
|
||||
OCRing your documents before feeding them into Paperless. Some
|
||||
scanners are able to do this!
|
||||
- Set [`PAPERLESS_OCR_SKIP_ARCHIVE_FILE`](configuration.md#PAPERLESS_OCR_SKIP_ARCHIVE_FILE) to `with_text` to skip archive
|
||||
file generation for already OCRed documents, or `always` to skip it
|
||||
for all documents.
|
||||
- Set [`PAPERLESS_ARCHIVE_FILE_GENERATION`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) to `never` to skip archive
|
||||
file generation entirely, saving disk space at the cost of in-browser PDF/A viewing.
|
||||
- If you want to perform OCR on the device, consider using
|
||||
`PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use
|
||||
less memory at the expense of slightly worse OCR results.
|
||||
|
||||
@@ -269,10 +269,6 @@ testpaths = [
|
||||
"src/documents/tests/",
|
||||
"src/paperless/tests/",
|
||||
"src/paperless_mail/tests/",
|
||||
"src/paperless_tesseract/tests/",
|
||||
"src/paperless_tika/tests",
|
||||
"src/paperless_text/tests/",
|
||||
"src/paperless_remote/tests/",
|
||||
"src/paperless_ai/tests",
|
||||
]
|
||||
|
||||
|
||||
@@ -468,7 +468,7 @@
|
||||
"time": 0.951,
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&tags__id__in=9",
|
||||
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&tags__id__in=9",
|
||||
"httpVersion": "HTTP/1.1",
|
||||
"cookies": [],
|
||||
"headers": [
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -534,7 +534,7 @@
|
||||
"time": 0.653,
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "http://localhost:8000/api/documents/?page=1&page_size=10&ordering=-created&truncate_content=true&include_selection_data=true&tags__id__all=9",
|
||||
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&tags__id__all=9",
|
||||
"httpVersion": "HTTP/1.1",
|
||||
"cookies": [],
|
||||
"headers": [
|
||||
|
||||
@@ -883,7 +883,7 @@
|
||||
"time": 0.93,
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&tags__id__all=4",
|
||||
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&tags__id__all=4",
|
||||
"httpVersion": "HTTP/1.1",
|
||||
"cookies": [],
|
||||
"headers": [
|
||||
@@ -961,7 +961,7 @@
|
||||
"time": -1,
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&tags__id__all=4",
|
||||
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&tags__id__all=4",
|
||||
"httpVersion": "HTTP/1.1",
|
||||
"cookies": [],
|
||||
"headers": [
|
||||
|
||||
@@ -16,7 +16,7 @@ test('basic filtering', async ({ page }) => {
|
||||
await expect(page).toHaveURL(/tags__id__all=9/)
|
||||
await expect(page.locator('pngx-document-list')).toHaveText(/8 documents/)
|
||||
await page.getByRole('button', { name: 'Document type' }).click()
|
||||
await page.getByRole('menuitem', { name: /^Invoice Test/ }).click()
|
||||
await page.getByRole('menuitem', { name: 'Invoice Test 3' }).click()
|
||||
await expect(page).toHaveURL(/document_type__id__in=1/)
|
||||
await expect(page.locator('pngx-document-list')).toHaveText(/3 documents/)
|
||||
await page.getByRole('button', { name: 'Reset filters' }).first().click()
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -297,11 +297,11 @@
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">87</context>
|
||||
<context context-type="linenumber">88</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">89</context>
|
||||
<context context-type="linenumber">90</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/dashboard/dashboard.component.html</context>
|
||||
@@ -324,11 +324,11 @@
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">94</context>
|
||||
<context context-type="linenumber">95</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">96</context>
|
||||
<context context-type="linenumber">97</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/common/share-link-bundle-dialog/share-link-bundle-dialog.component.html</context>
|
||||
@@ -375,15 +375,15 @@
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">54</context>
|
||||
<context context-type="linenumber">55</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">273</context>
|
||||
<context context-type="linenumber">274</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">275</context>
|
||||
<context context-type="linenumber">276</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="5890330709052835856" datatype="html">
|
||||
@@ -728,11 +728,11 @@
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">308</context>
|
||||
<context context-type="linenumber">309</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">311</context>
|
||||
<context context-type="linenumber">312</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="2272120016352772836" datatype="html">
|
||||
@@ -1139,11 +1139,11 @@
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">233</context>
|
||||
<context context-type="linenumber">234</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">235</context>
|
||||
<context context-type="linenumber">236</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/manage/saved-views/saved-views.component.html</context>
|
||||
@@ -1700,7 +1700,7 @@
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">204</context>
|
||||
<context context-type="linenumber">205</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/common/input/tags/tags.component.ts</context>
|
||||
@@ -1782,15 +1782,15 @@
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.ts</context>
|
||||
<context context-type="linenumber">156</context>
|
||||
<context context-type="linenumber">164</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.ts</context>
|
||||
<context context-type="linenumber">230</context>
|
||||
<context context-type="linenumber">238</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.ts</context>
|
||||
<context context-type="linenumber">255</context>
|
||||
<context context-type="linenumber">263</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="2991443309752293110" datatype="html">
|
||||
@@ -1801,11 +1801,11 @@
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">296</context>
|
||||
<context context-type="linenumber">297</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">298</context>
|
||||
<context context-type="linenumber">299</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="103921551219467537" datatype="html">
|
||||
@@ -2224,11 +2224,11 @@
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">256</context>
|
||||
<context context-type="linenumber">257</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">259</context>
|
||||
<context context-type="linenumber">260</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="3818027200170621545" datatype="html">
|
||||
@@ -2581,11 +2581,11 @@
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">287</context>
|
||||
<context context-type="linenumber">288</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">289</context>
|
||||
<context context-type="linenumber">290</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="4569276013106377105" datatype="html">
|
||||
@@ -2897,90 +2897,90 @@
|
||||
<source>Logged in as <x id="INTERPOLATION" equiv-text="{{this.settingsService.displayName}}"/></source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">46</context>
|
||||
<context context-type="linenumber">47</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="2127032578120864096" datatype="html">
|
||||
<source>My Profile</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">50</context>
|
||||
<context context-type="linenumber">51</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="3797778920049399855" datatype="html">
|
||||
<source>Logout</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">57</context>
|
||||
<context context-type="linenumber">58</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="4895326106573044490" datatype="html">
|
||||
<source>Documentation</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">62</context>
|
||||
<context context-type="linenumber">63</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">317</context>
|
||||
<context context-type="linenumber">318</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">320</context>
|
||||
<context context-type="linenumber">321</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="472206565520537964" datatype="html">
|
||||
<source>Saved views</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">104</context>
|
||||
<context context-type="linenumber">105</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">134</context>
|
||||
<context context-type="linenumber">135</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="6988090220128974198" datatype="html">
|
||||
<source>Open documents</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">143</context>
|
||||
<context context-type="linenumber">144</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="5687256342387781369" datatype="html">
|
||||
<source>Close all</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">163</context>
|
||||
<context context-type="linenumber">164</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">165</context>
|
||||
<context context-type="linenumber">166</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="3897348120591552265" datatype="html">
|
||||
<source>Manage</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">174</context>
|
||||
<context context-type="linenumber">175</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="8008131619909556709" datatype="html">
|
||||
<source>Attributes</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">181</context>
|
||||
<context context-type="linenumber">182</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">183</context>
|
||||
<context context-type="linenumber">184</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="7437910965833684826" datatype="html">
|
||||
<source>Correspondents</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">209</context>
|
||||
<context context-type="linenumber">210</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/dashboard/widgets/statistics-widget/statistics-widget.component.html</context>
|
||||
@@ -2995,7 +2995,7 @@
|
||||
<source>Document types</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">214</context>
|
||||
<context context-type="linenumber">215</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/manage/document-attributes/document-attributes.component.ts</context>
|
||||
@@ -3006,7 +3006,7 @@
|
||||
<source>Storage paths</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">219</context>
|
||||
<context context-type="linenumber">220</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/manage/document-attributes/document-attributes.component.ts</context>
|
||||
@@ -3017,7 +3017,7 @@
|
||||
<source>Custom fields</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">224</context>
|
||||
<context context-type="linenumber">225</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/document-list/bulk-editor/bulk-editor.component.html</context>
|
||||
@@ -3040,11 +3040,11 @@
|
||||
<source>Workflows</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">242</context>
|
||||
<context context-type="linenumber">243</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">244</context>
|
||||
<context context-type="linenumber">245</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/manage/workflows/workflows.component.html</context>
|
||||
@@ -3055,92 +3055,92 @@
|
||||
<source>Mail</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">249</context>
|
||||
<context context-type="linenumber">250</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">252</context>
|
||||
<context context-type="linenumber">253</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="7844706011418789951" datatype="html">
|
||||
<source>Administration</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">267</context>
|
||||
<context context-type="linenumber">268</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="3008420115644088420" datatype="html">
|
||||
<source>Configuration</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">280</context>
|
||||
<context context-type="linenumber">281</context>
|
||||
</context-group>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">282</context>
|
||||
<context context-type="linenumber">283</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="1534029177398918729" datatype="html">
|
||||
<source>GitHub</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">327</context>
|
||||
<context context-type="linenumber">328</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="4112664765954374539" datatype="html">
|
||||
<source>is available.</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">336,337</context>
|
||||
<context context-type="linenumber">337,338</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="1175891574282637937" datatype="html">
|
||||
<source>Click to view.</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">337</context>
|
||||
<context context-type="linenumber">338</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="9811291095862612" datatype="html">
|
||||
<source>Paperless-ngx can automatically check for updates</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">341</context>
|
||||
<context context-type="linenumber">342</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="894819944961861800" datatype="html">
|
||||
<source> How does this work? </source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">348,350</context>
|
||||
<context context-type="linenumber">349,351</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="509090351011426949" datatype="html">
|
||||
<source>Update available</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.html</context>
|
||||
<context context-type="linenumber">361</context>
|
||||
<context context-type="linenumber">362</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="1542489069631984294" datatype="html">
|
||||
<source>Sidebar views updated</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.ts</context>
|
||||
<context context-type="linenumber">343</context>
|
||||
<context context-type="linenumber">383</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="3547923076537026828" datatype="html">
|
||||
<source>Error updating sidebar views</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.ts</context>
|
||||
<context context-type="linenumber">346</context>
|
||||
<context context-type="linenumber">386</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="2526035785704676448" datatype="html">
|
||||
<source>An error occurred while saving update checking settings.</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/components/app-frame/app-frame.component.ts</context>
|
||||
<context context-type="linenumber">367</context>
|
||||
<context context-type="linenumber">407</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="4580988005648117665" datatype="html">
|
||||
@@ -11187,21 +11187,21 @@
|
||||
<source>Successfully completed one-time migratration of settings to the database!</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/services/settings.service.ts</context>
|
||||
<context context-type="linenumber">609</context>
|
||||
<context context-type="linenumber">635</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="5558341108007064934" datatype="html">
|
||||
<source>Unable to migrate settings to the database, please try saving manually.</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/services/settings.service.ts</context>
|
||||
<context context-type="linenumber">610</context>
|
||||
<context context-type="linenumber">636</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="1168781785897678748" datatype="html">
|
||||
<source>You can restart the tour from the settings page.</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/services/settings.service.ts</context>
|
||||
<context context-type="linenumber">683</context>
|
||||
<context context-type="linenumber">708</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="3852289441366561594" datatype="html">
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
<nav class="navbar navbar-dark fixed-top bg-primary flex-md-nowrap p-0 shadow-sm">
|
||||
<button class="navbar-toggler d-md-none collapsed border-0" type="button" data-toggle="collapse"
|
||||
data-target="#sidebarMenu" aria-controls="sidebarMenu" aria-expanded="false" aria-label="Toggle navigation"
|
||||
(click)="isMenuCollapsed = !isMenuCollapsed">
|
||||
(click)="mobileSearchHidden = false; isMenuCollapsed = !isMenuCollapsed">
|
||||
<span class="navbar-toggler-icon"></span>
|
||||
</button>
|
||||
<a class="navbar-brand d-flex align-items-center me-0 px-3 py-3 order-sm-0"
|
||||
@@ -24,7 +24,8 @@
|
||||
}
|
||||
</div>
|
||||
</a>
|
||||
<div class="search-container flex-grow-1 py-2 pb-3 pb-sm-2 px-3 ps-md-4 me-sm-auto order-3 order-sm-1">
|
||||
<div class="search-container flex-grow-1 py-2 pb-3 pb-sm-2 px-3 ps-md-4 me-sm-auto order-3 order-sm-1"
|
||||
[class.mobile-hidden]="mobileSearchHidden">
|
||||
<div class="col-12 col-md-7">
|
||||
<pngx-global-search></pngx-global-search>
|
||||
</div>
|
||||
@@ -378,7 +379,7 @@
|
||||
</div>
|
||||
</nav>
|
||||
|
||||
<main role="main" class="ms-sm-auto px-md-4"
|
||||
<main role="main" class="ms-sm-auto px-md-4" [class.mobile-search-hidden]="mobileSearchHidden"
|
||||
[ngClass]="slimSidebarEnabled ? 'col-slim' : 'col-md-9 col-lg-10 col-xxxl-11'">
|
||||
<router-outlet></router-outlet>
|
||||
</main>
|
||||
|
||||
@@ -44,6 +44,23 @@
|
||||
.sidebar {
|
||||
top: 3.5rem;
|
||||
}
|
||||
|
||||
.search-container {
|
||||
max-height: 4.5rem;
|
||||
overflow: hidden;
|
||||
transition: max-height .2s ease, opacity .2s ease, padding-top .2s ease, padding-bottom .2s ease;
|
||||
|
||||
&.mobile-hidden {
|
||||
max-height: 0;
|
||||
opacity: 0;
|
||||
padding-top: 0 !important;
|
||||
padding-bottom: 0 !important;
|
||||
}
|
||||
}
|
||||
|
||||
main.mobile-search-hidden {
|
||||
padding-top: 56px;
|
||||
}
|
||||
}
|
||||
|
||||
main {
|
||||
|
||||
@@ -293,6 +293,59 @@ describe('AppFrameComponent', () => {
|
||||
expect(component.isMenuCollapsed).toBeTruthy()
|
||||
})
|
||||
|
||||
it('should hide mobile search when scrolling down and show it when scrolling up', () => {
|
||||
Object.defineProperty(globalThis, 'innerWidth', {
|
||||
value: 767,
|
||||
})
|
||||
|
||||
component.ngOnInit()
|
||||
|
||||
Object.defineProperty(globalThis, 'scrollY', {
|
||||
configurable: true,
|
||||
value: 40,
|
||||
})
|
||||
component.onWindowScroll()
|
||||
expect(component.mobileSearchHidden).toBe(true)
|
||||
|
||||
Object.defineProperty(globalThis, 'scrollY', {
|
||||
configurable: true,
|
||||
value: 0,
|
||||
})
|
||||
component.onWindowScroll()
|
||||
expect(component.mobileSearchHidden).toBe(false)
|
||||
})
|
||||
|
||||
it('should keep mobile search visible on desktop scroll or resize', () => {
|
||||
Object.defineProperty(globalThis, 'innerWidth', {
|
||||
value: 1024,
|
||||
})
|
||||
component.ngOnInit()
|
||||
component.mobileSearchHidden = true
|
||||
|
||||
component.onWindowScroll()
|
||||
|
||||
expect(component.mobileSearchHidden).toBe(false)
|
||||
|
||||
component.mobileSearchHidden = true
|
||||
component.onWindowResize()
|
||||
})
|
||||
|
||||
it('should keep mobile search visible while the mobile menu is expanded', () => {
|
||||
Object.defineProperty(globalThis, 'innerWidth', {
|
||||
value: 767,
|
||||
})
|
||||
component.ngOnInit()
|
||||
component.isMenuCollapsed = false
|
||||
|
||||
Object.defineProperty(globalThis, 'scrollY', {
|
||||
configurable: true,
|
||||
value: 40,
|
||||
})
|
||||
component.onWindowScroll()
|
||||
|
||||
expect(component.mobileSearchHidden).toBe(false)
|
||||
})
|
||||
|
||||
it('should support close document & navigate on close current doc', () => {
|
||||
const closeSpy = jest.spyOn(openDocumentsService, 'closeDocument')
|
||||
closeSpy.mockReturnValue(of(true))
|
||||
|
||||
@@ -51,6 +51,8 @@ import { ComponentWithPermissions } from '../with-permissions/with-permissions.c
|
||||
import { GlobalSearchComponent } from './global-search/global-search.component'
|
||||
import { ToastsDropdownComponent } from './toasts-dropdown/toasts-dropdown.component'
|
||||
|
||||
const SCROLL_THRESHOLD = 16
|
||||
|
||||
@Component({
|
||||
selector: 'pngx-app-frame',
|
||||
templateUrl: './app-frame.component.html',
|
||||
@@ -94,6 +96,10 @@ export class AppFrameComponent
|
||||
|
||||
slimSidebarAnimating: boolean = false
|
||||
|
||||
public mobileSearchHidden: boolean = false
|
||||
|
||||
private lastScrollY: number = 0
|
||||
|
||||
constructor() {
|
||||
super()
|
||||
const permissionsService = this.permissionsService
|
||||
@@ -111,6 +117,8 @@ export class AppFrameComponent
|
||||
}
|
||||
|
||||
ngOnInit(): void {
|
||||
this.lastScrollY = window.scrollY
|
||||
|
||||
if (this.settingsService.get(SETTINGS_KEYS.UPDATE_CHECKING_ENABLED)) {
|
||||
this.checkForUpdates()
|
||||
}
|
||||
@@ -263,6 +271,38 @@ export class AppFrameComponent
|
||||
return this.settingsService.get(SETTINGS_KEYS.AI_ENABLED)
|
||||
}
|
||||
|
||||
@HostListener('window:resize')
|
||||
onWindowResize(): void {
|
||||
if (!this.isMobileViewport()) {
|
||||
this.mobileSearchHidden = false
|
||||
}
|
||||
}
|
||||
|
||||
@HostListener('window:scroll')
|
||||
onWindowScroll(): void {
|
||||
const currentScrollY = window.scrollY
|
||||
|
||||
if (!this.isMobileViewport() || this.isMenuCollapsed === false) {
|
||||
this.mobileSearchHidden = false
|
||||
this.lastScrollY = currentScrollY
|
||||
return
|
||||
}
|
||||
|
||||
const delta = currentScrollY - this.lastScrollY
|
||||
|
||||
if (currentScrollY <= 0 || delta < -SCROLL_THRESHOLD) {
|
||||
this.mobileSearchHidden = false
|
||||
} else if (currentScrollY > SCROLL_THRESHOLD && delta > SCROLL_THRESHOLD) {
|
||||
this.mobileSearchHidden = true
|
||||
}
|
||||
|
||||
this.lastScrollY = currentScrollY
|
||||
}
|
||||
|
||||
private isMobileViewport(): boolean {
|
||||
return window.innerWidth < 768
|
||||
}
|
||||
|
||||
closeMenu() {
|
||||
this.isMenuCollapsed = true
|
||||
}
|
||||
|
||||
@@ -20,9 +20,9 @@ import { Subject, filter, takeUntil } from 'rxjs'
|
||||
import { NEGATIVE_NULL_FILTER_VALUE } from 'src/app/data/filter-rule-type'
|
||||
import { MatchingModel } from 'src/app/data/matching-model'
|
||||
import { ObjectWithPermissions } from 'src/app/data/object-with-permissions'
|
||||
import { SelectionDataItem } from 'src/app/data/results'
|
||||
import { FilterPipe } from 'src/app/pipes/filter.pipe'
|
||||
import { HotKeyService } from 'src/app/services/hot-key.service'
|
||||
import { SelectionDataItem } from 'src/app/services/rest/document.service'
|
||||
import { pngxPopperOptions } from 'src/app/utils/popper-options'
|
||||
import { LoadingComponentWithPermissions } from '../../loading-component/loading.component'
|
||||
import { ClearableBadgeComponent } from '../clearable-badge/clearable-badge.component'
|
||||
|
||||
@@ -300,7 +300,7 @@ describe('BulkEditorComponent', () => {
|
||||
parameters: { add_tags: [101], remove_tags: [] },
|
||||
})
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -332,7 +332,7 @@ describe('BulkEditorComponent', () => {
|
||||
.expectOne(`${environment.apiBaseUrl}documents/bulk_edit/`)
|
||||
.flush(true)
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -423,7 +423,7 @@ describe('BulkEditorComponent', () => {
|
||||
parameters: { correspondent: 101 },
|
||||
})
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -455,7 +455,7 @@ describe('BulkEditorComponent', () => {
|
||||
.expectOne(`${environment.apiBaseUrl}documents/bulk_edit/`)
|
||||
.flush(true)
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -521,7 +521,7 @@ describe('BulkEditorComponent', () => {
|
||||
parameters: { document_type: 101 },
|
||||
})
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -553,7 +553,7 @@ describe('BulkEditorComponent', () => {
|
||||
.expectOne(`${environment.apiBaseUrl}documents/bulk_edit/`)
|
||||
.flush(true)
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -619,7 +619,7 @@ describe('BulkEditorComponent', () => {
|
||||
parameters: { storage_path: 101 },
|
||||
})
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -651,7 +651,7 @@ describe('BulkEditorComponent', () => {
|
||||
.expectOne(`${environment.apiBaseUrl}documents/bulk_edit/`)
|
||||
.flush(true)
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -717,7 +717,7 @@ describe('BulkEditorComponent', () => {
|
||||
parameters: { add_custom_fields: [101], remove_custom_fields: [102] },
|
||||
})
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -749,7 +749,7 @@ describe('BulkEditorComponent', () => {
|
||||
.expectOne(`${environment.apiBaseUrl}documents/bulk_edit/`)
|
||||
.flush(true)
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -858,7 +858,7 @@ describe('BulkEditorComponent', () => {
|
||||
documents: [3, 4],
|
||||
})
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -951,7 +951,7 @@ describe('BulkEditorComponent', () => {
|
||||
documents: [3, 4],
|
||||
})
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -986,7 +986,7 @@ describe('BulkEditorComponent', () => {
|
||||
source_mode: 'latest_version',
|
||||
})
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -1027,7 +1027,7 @@ describe('BulkEditorComponent', () => {
|
||||
metadata_document_id: 3,
|
||||
})
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -1046,7 +1046,7 @@ describe('BulkEditorComponent', () => {
|
||||
delete_originals: true,
|
||||
})
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -1067,7 +1067,7 @@ describe('BulkEditorComponent', () => {
|
||||
archive_fallback: true,
|
||||
})
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -1153,7 +1153,7 @@ describe('BulkEditorComponent', () => {
|
||||
},
|
||||
})
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
@@ -1460,7 +1460,7 @@ describe('BulkEditorComponent', () => {
|
||||
expect(toastServiceShowInfoSpy).toHaveBeenCalled()
|
||||
expect(listReloadSpy).toHaveBeenCalled()
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
) // list reload
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id`
|
||||
|
||||
@@ -16,7 +16,6 @@ import { first, map, Observable, Subject, switchMap, takeUntil } from 'rxjs'
|
||||
import { ConfirmDialogComponent } from 'src/app/components/common/confirm-dialog/confirm-dialog.component'
|
||||
import { CustomField } from 'src/app/data/custom-field'
|
||||
import { MatchingModel } from 'src/app/data/matching-model'
|
||||
import { SelectionDataItem } from 'src/app/data/results'
|
||||
import { SETTINGS_KEYS } from 'src/app/data/ui-settings'
|
||||
import { IfPermissionsDirective } from 'src/app/directives/if-permissions.directive'
|
||||
import { DocumentListViewService } from 'src/app/services/document-list-view.service'
|
||||
@@ -33,6 +32,7 @@ import {
|
||||
DocumentBulkEditMethod,
|
||||
DocumentService,
|
||||
MergeDocumentsRequest,
|
||||
SelectionDataItem,
|
||||
} from 'src/app/services/rest/document.service'
|
||||
import { SavedViewService } from 'src/app/services/rest/saved-view.service'
|
||||
import { ShareLinkBundleService } from 'src/app/services/rest/share-link-bundle.service'
|
||||
|
||||
@@ -56,13 +56,20 @@ $paperless-card-breakpoints: (
|
||||
|
||||
.sticky-top {
|
||||
z-index: 990; // below main navbar
|
||||
top: calc(7rem - 2px); // height of navbar (mobile)
|
||||
top: calc(7rem - 2px); // height of navbar + search row (mobile)
|
||||
transition: top 0.2s ease;
|
||||
|
||||
@media (min-width: 580px) {
|
||||
top: 3.5rem; // height of navbar
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 579.98px) {
|
||||
:host-context(main.mobile-search-hidden) .sticky-top {
|
||||
top: calc(3.5rem - 2px); // height of navbar only when search is hidden
|
||||
}
|
||||
}
|
||||
|
||||
.table .form-check {
|
||||
padding: 0.2rem;
|
||||
min-height: 0;
|
||||
|
||||
@@ -76,7 +76,6 @@ import {
|
||||
FILTER_TITLE_CONTENT,
|
||||
NEGATIVE_NULL_FILTER_VALUE,
|
||||
} from 'src/app/data/filter-rule-type'
|
||||
import { SelectionData, SelectionDataItem } from 'src/app/data/results'
|
||||
import {
|
||||
PermissionAction,
|
||||
PermissionType,
|
||||
@@ -85,7 +84,11 @@ import {
|
||||
import { CorrespondentService } from 'src/app/services/rest/correspondent.service'
|
||||
import { CustomFieldsService } from 'src/app/services/rest/custom-fields.service'
|
||||
import { DocumentTypeService } from 'src/app/services/rest/document-type.service'
|
||||
import { DocumentService } from 'src/app/services/rest/document.service'
|
||||
import {
|
||||
DocumentService,
|
||||
SelectionData,
|
||||
SelectionDataItem,
|
||||
} from 'src/app/services/rest/document.service'
|
||||
import { SearchService } from 'src/app/services/rest/search.service'
|
||||
import { StoragePathService } from 'src/app/services/rest/storage-path.service'
|
||||
import { TagService } from 'src/app/services/rest/tag.service'
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
import { Document } from './document'
|
||||
|
||||
export interface Results<T> {
|
||||
count: number
|
||||
|
||||
@@ -7,20 +5,3 @@ export interface Results<T> {
|
||||
|
||||
all: number[]
|
||||
}
|
||||
|
||||
export interface SelectionDataItem {
|
||||
id: number
|
||||
document_count: number
|
||||
}
|
||||
|
||||
export interface SelectionData {
|
||||
selected_storage_paths: SelectionDataItem[]
|
||||
selected_correspondents: SelectionDataItem[]
|
||||
selected_tags: SelectionDataItem[]
|
||||
selected_document_types: SelectionDataItem[]
|
||||
selected_custom_fields: SelectionDataItem[]
|
||||
}
|
||||
|
||||
export interface DocumentResults extends Results<Document> {
|
||||
selection_data?: SelectionData
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@ import {
|
||||
FILTER_HAS_TAGS_ANY,
|
||||
} from '../data/filter-rule-type'
|
||||
import { SavedView } from '../data/saved-view'
|
||||
import { DOCUMENT_LIST_SERVICE } from '../data/storage-keys'
|
||||
import { SETTINGS_KEYS } from '../data/ui-settings'
|
||||
import { PermissionsGuard } from '../guards/permissions.guard'
|
||||
import { DocumentListViewService } from './document-list-view.service'
|
||||
@@ -126,10 +127,13 @@ describe('DocumentListViewService', () => {
|
||||
expect(documentListViewService.currentPage).toEqual(1)
|
||||
documentListViewService.reload()
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
req.flush(full_results)
|
||||
httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/selection_data/`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
expect(documentListViewService.isReloading).toBeFalsy()
|
||||
expect(documentListViewService.activeSavedViewId).toBeNull()
|
||||
@@ -141,12 +145,12 @@ describe('DocumentListViewService', () => {
|
||||
it('should handle error on page request out of range', () => {
|
||||
documentListViewService.currentPage = 50
|
||||
let req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=50&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=50&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
req.flush([], { status: 404, statusText: 'Unexpected error' })
|
||||
req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
expect(documentListViewService.currentPage).toEqual(1)
|
||||
@@ -163,7 +167,7 @@ describe('DocumentListViewService', () => {
|
||||
]
|
||||
documentListViewService.setFilterRules(filterRulesAny)
|
||||
let req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&tags__id__in=${tags__id__in}`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&tags__id__in=${tags__id__in}`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
req.flush(
|
||||
@@ -171,13 +175,13 @@ describe('DocumentListViewService', () => {
|
||||
{ status: 404, statusText: 'Unexpected error' }
|
||||
)
|
||||
req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
// reset the list
|
||||
documentListViewService.setFilterRules([])
|
||||
req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
})
|
||||
|
||||
@@ -185,7 +189,7 @@ describe('DocumentListViewService', () => {
|
||||
documentListViewService.currentPage = 1
|
||||
documentListViewService.sortField = 'custom_field_999'
|
||||
let req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-custom_field_999&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-custom_field_999&truncate_content=true`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
req.flush(
|
||||
@@ -194,7 +198,7 @@ describe('DocumentListViewService', () => {
|
||||
)
|
||||
// resets itself
|
||||
req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
})
|
||||
|
||||
@@ -209,7 +213,7 @@ describe('DocumentListViewService', () => {
|
||||
]
|
||||
documentListViewService.setFilterRules(filterRulesAny)
|
||||
let req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&tags__id__in=${tags__id__in}`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&tags__id__in=${tags__id__in}`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
req.flush('Generic error', { status: 404, statusText: 'Unexpected error' })
|
||||
@@ -217,7 +221,7 @@ describe('DocumentListViewService', () => {
|
||||
// reset the list
|
||||
documentListViewService.setFilterRules([])
|
||||
req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
})
|
||||
|
||||
@@ -226,7 +230,7 @@ describe('DocumentListViewService', () => {
|
||||
expect(documentListViewService.sortReverse).toBeTruthy()
|
||||
documentListViewService.setSort('added', false)
|
||||
let req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=added&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=added&truncate_content=true`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
expect(documentListViewService.sortField).toEqual('added')
|
||||
@@ -234,17 +238,40 @@ describe('DocumentListViewService', () => {
|
||||
|
||||
documentListViewService.sortField = 'created'
|
||||
req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=created&truncate_content=true`
|
||||
)
|
||||
expect(documentListViewService.sortField).toEqual('created')
|
||||
documentListViewService.sortReverse = true
|
||||
req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
expect(documentListViewService.sortReverse).toBeTruthy()
|
||||
})
|
||||
|
||||
it('restores only known list view state fields from local storage', () => {
|
||||
try {
|
||||
localStorage.setItem(
|
||||
DOCUMENT_LIST_SERVICE.CURRENT_VIEW_CONFIG,
|
||||
'{"currentPage":3,"sortField":"title","sortReverse":false,"__proto__":{"polluted":true},"injected":"ignored"}'
|
||||
)
|
||||
|
||||
const restoredService = TestBed.runInInjectionContext(
|
||||
() => new DocumentListViewService()
|
||||
)
|
||||
|
||||
expect(restoredService.currentPage).toEqual(3)
|
||||
expect(restoredService.sortField).toEqual('title')
|
||||
expect(restoredService.sortReverse).toBeFalsy()
|
||||
expect(
|
||||
(restoredService as any).activeListViewState.injected
|
||||
).toBeUndefined()
|
||||
expect(({} as any).polluted).toBeUndefined()
|
||||
} finally {
|
||||
localStorage.removeItem(DOCUMENT_LIST_SERVICE.CURRENT_VIEW_CONFIG)
|
||||
}
|
||||
})
|
||||
|
||||
it('should load from query params', () => {
|
||||
expect(documentListViewService.currentPage).toEqual(1)
|
||||
const page = 2
|
||||
@@ -259,7 +286,7 @@ describe('DocumentListViewService', () => {
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=${page}&page_size=${
|
||||
documentListViewService.pageSize
|
||||
}&ordering=${reverse ? '-' : ''}${sort}&truncate_content=true&include_selection_data=true`
|
||||
}&ordering=${reverse ? '-' : ''}${sort}&truncate_content=true`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
expect(documentListViewService.currentPage).toEqual(page)
|
||||
@@ -276,7 +303,7 @@ describe('DocumentListViewService', () => {
|
||||
}
|
||||
documentListViewService.loadFromQueryParams(convertToParamMap(params))
|
||||
let req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=${documentListViewService.currentPage}&page_size=${documentListViewService.pageSize}&ordering=-added&truncate_content=true&include_selection_data=true&tags__id__all=${tags__id__all}`
|
||||
`${environment.apiBaseUrl}documents/?page=${documentListViewService.currentPage}&page_size=${documentListViewService.pageSize}&ordering=-added&truncate_content=true&tags__id__all=${tags__id__all}`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
expect(documentListViewService.filterRules).toEqual([
|
||||
@@ -286,12 +313,15 @@ describe('DocumentListViewService', () => {
|
||||
},
|
||||
])
|
||||
req.flush(full_results)
|
||||
httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/selection_data/`
|
||||
)
|
||||
})
|
||||
|
||||
it('should use filter rules to update query params', () => {
|
||||
documentListViewService.setFilterRules(filterRules)
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=${documentListViewService.currentPage}&page_size=${documentListViewService.pageSize}&ordering=-created&truncate_content=true&include_selection_data=true&tags__id__all=${tags__id__all}`
|
||||
`${environment.apiBaseUrl}documents/?page=${documentListViewService.currentPage}&page_size=${documentListViewService.pageSize}&ordering=-created&truncate_content=true&tags__id__all=${tags__id__all}`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
})
|
||||
@@ -300,26 +330,34 @@ describe('DocumentListViewService', () => {
|
||||
documentListViewService.currentPage = 2
|
||||
let req = httpTestingController.expectOne((request) =>
|
||||
request.urlWithParams.startsWith(
|
||||
`${environment.apiBaseUrl}documents/?page=2&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=2&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
req.flush(full_results)
|
||||
req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/selection_data/`
|
||||
)
|
||||
req.flush([])
|
||||
|
||||
documentListViewService.setFilterRules(filterRules, true)
|
||||
|
||||
const filteredReqs = httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&tags__id__all=${tags__id__all}`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&tags__id__all=${tags__id__all}`
|
||||
)
|
||||
expect(filteredReqs).toHaveLength(1)
|
||||
filteredReqs[0].flush(full_results)
|
||||
req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/selection_data/`
|
||||
)
|
||||
req.flush([])
|
||||
expect(documentListViewService.currentPage).toEqual(1)
|
||||
})
|
||||
|
||||
it('should support quick filter', () => {
|
||||
documentListViewService.quickFilter(filterRules)
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=${documentListViewService.currentPage}&page_size=${documentListViewService.pageSize}&ordering=-created&truncate_content=true&include_selection_data=true&tags__id__all=${tags__id__all}`
|
||||
`${environment.apiBaseUrl}documents/?page=${documentListViewService.currentPage}&page_size=${documentListViewService.pageSize}&ordering=-created&truncate_content=true&tags__id__all=${tags__id__all}`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
})
|
||||
@@ -342,21 +380,21 @@ describe('DocumentListViewService', () => {
|
||||
convertToParamMap(params)
|
||||
)
|
||||
let req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=${page}&page_size=${documentListViewService.pageSize}&ordering=-added&truncate_content=true&include_selection_data=true&tags__id__all=${tags__id__all}`
|
||||
`${environment.apiBaseUrl}documents/?page=${page}&page_size=${documentListViewService.pageSize}&ordering=-added&truncate_content=true&tags__id__all=${tags__id__all}`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
// reset the list
|
||||
documentListViewService.currentPage = 1
|
||||
req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-added&truncate_content=true&include_selection_data=true&tags__id__all=9`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-added&truncate_content=true&tags__id__all=9`
|
||||
)
|
||||
documentListViewService.setFilterRules([])
|
||||
req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-added&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-added&truncate_content=true`
|
||||
)
|
||||
documentListViewService.sortField = 'created'
|
||||
req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
documentListViewService.activateSavedView(null)
|
||||
})
|
||||
@@ -364,18 +402,21 @@ describe('DocumentListViewService', () => {
|
||||
it('should support navigating next / previous', () => {
|
||||
documentListViewService.setFilterRules([])
|
||||
let req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
expect(documentListViewService.currentPage).toEqual(1)
|
||||
documentListViewService.pageSize = 3
|
||||
req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=3&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=3&ordering=-created&truncate_content=true`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
req.flush({
|
||||
count: 3,
|
||||
results: documents.slice(0, 3),
|
||||
})
|
||||
httpTestingController
|
||||
.expectOne(`${environment.apiBaseUrl}documents/selection_data/`)
|
||||
.flush([])
|
||||
expect(documentListViewService.hasNext(documents[0].id)).toBeTruthy()
|
||||
expect(documentListViewService.hasPrevious(documents[0].id)).toBeFalsy()
|
||||
documentListViewService.getNext(documents[0].id).subscribe((docId) => {
|
||||
@@ -422,7 +463,7 @@ describe('DocumentListViewService', () => {
|
||||
expect(documentListViewService.currentPage).toEqual(1)
|
||||
documentListViewService.pageSize = 3
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=3&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=3&ordering=-created&truncate_content=true`
|
||||
)
|
||||
jest
|
||||
.spyOn(documentListViewService, 'getLastPage')
|
||||
@@ -437,7 +478,7 @@ describe('DocumentListViewService', () => {
|
||||
expect(reloadSpy).toHaveBeenCalled()
|
||||
expect(documentListViewService.currentPage).toEqual(2)
|
||||
const reqs = httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=2&page_size=3&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=2&page_size=3&ordering=-created&truncate_content=true`
|
||||
)
|
||||
expect(reqs.length).toBeGreaterThan(0)
|
||||
})
|
||||
@@ -472,11 +513,11 @@ describe('DocumentListViewService', () => {
|
||||
.mockReturnValue(documents)
|
||||
documentListViewService.currentPage = 2
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=2&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=2&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
documentListViewService.pageSize = 3
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=2&page_size=3&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=2&page_size=3&ordering=-created&truncate_content=true`
|
||||
)
|
||||
const reloadSpy = jest.spyOn(documentListViewService, 'reload')
|
||||
documentListViewService.getPrevious(1).subscribe({
|
||||
@@ -486,7 +527,7 @@ describe('DocumentListViewService', () => {
|
||||
expect(reloadSpy).toHaveBeenCalled()
|
||||
expect(documentListViewService.currentPage).toEqual(1)
|
||||
const reqs = httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=3&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=3&ordering=-created&truncate_content=true`
|
||||
)
|
||||
expect(reqs.length).toBeGreaterThan(0)
|
||||
})
|
||||
@@ -499,10 +540,13 @@ describe('DocumentListViewService', () => {
|
||||
it('should support select a document', () => {
|
||||
documentListViewService.reload()
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
req.flush(full_results)
|
||||
httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/selection_data/`
|
||||
)
|
||||
documentListViewService.toggleSelected(documents[0])
|
||||
expect(documentListViewService.isSelected(documents[0])).toBeTruthy()
|
||||
documentListViewService.toggleSelected(documents[0])
|
||||
@@ -524,13 +568,16 @@ describe('DocumentListViewService', () => {
|
||||
it('should support select page', () => {
|
||||
documentListViewService.pageSize = 3
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=3&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=3&ordering=-created&truncate_content=true`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
req.flush({
|
||||
count: 3,
|
||||
results: documents.slice(0, 3),
|
||||
})
|
||||
httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/selection_data/`
|
||||
)
|
||||
documentListViewService.selectPage()
|
||||
expect(documentListViewService.selected.size).toEqual(3)
|
||||
expect(documentListViewService.isSelected(documents[5])).toBeFalsy()
|
||||
@@ -539,10 +586,13 @@ describe('DocumentListViewService', () => {
|
||||
it('should support select range', () => {
|
||||
documentListViewService.reload()
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
req.flush(full_results)
|
||||
httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/selection_data/`
|
||||
)
|
||||
documentListViewService.toggleSelected(documents[0])
|
||||
expect(documentListViewService.isSelected(documents[0])).toBeTruthy()
|
||||
documentListViewService.selectRangeTo(documents[2])
|
||||
@@ -562,7 +612,7 @@ describe('DocumentListViewService', () => {
|
||||
|
||||
documentListViewService.setFilterRules(filterRules)
|
||||
httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&tags__id__all=9`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&tags__id__all=9`
|
||||
)
|
||||
const reqs = httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=100000&fields=id&tags__id__all=9`
|
||||
@@ -578,7 +628,7 @@ describe('DocumentListViewService', () => {
|
||||
const cancelSpy = jest.spyOn(documentListViewService, 'cancelPending')
|
||||
documentListViewService.reload()
|
||||
httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&tags__id__all=9`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&tags__id__all=9`
|
||||
)
|
||||
expect(cancelSpy).toHaveBeenCalled()
|
||||
})
|
||||
@@ -597,7 +647,7 @@ describe('DocumentListViewService', () => {
|
||||
documentListViewService.setFilterRules([])
|
||||
expect(documentListViewService.sortField).toEqual('created')
|
||||
httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
})
|
||||
|
||||
@@ -624,11 +674,11 @@ describe('DocumentListViewService', () => {
|
||||
expect(localStorageSpy).toHaveBeenCalled()
|
||||
// reload triggered
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
documentListViewService.displayFields = null
|
||||
httpTestingController.match(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
expect(documentListViewService.displayFields).toEqual(
|
||||
DEFAULT_DISPLAY_FIELDS.filter((f) => f.id !== DisplayField.ADDED).map(
|
||||
@@ -668,7 +718,7 @@ describe('DocumentListViewService', () => {
|
||||
it('should generate quick filter URL preserving default state', () => {
|
||||
documentListViewService.reload()
|
||||
httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true`
|
||||
`${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true`
|
||||
)
|
||||
const urlTree = documentListViewService.getQuickFilterUrl(filterRules)
|
||||
expect(urlTree).toBeDefined()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import { Injectable, inject } from '@angular/core'
|
||||
import { ParamMap, Router, UrlTree } from '@angular/router'
|
||||
import { Observable, Subject, takeUntil } from 'rxjs'
|
||||
import { Observable, Subject, first, takeUntil } from 'rxjs'
|
||||
import {
|
||||
DEFAULT_DISPLAY_FIELDS,
|
||||
DisplayField,
|
||||
@@ -8,7 +8,6 @@ import {
|
||||
Document,
|
||||
} from '../data/document'
|
||||
import { FilterRule } from '../data/filter-rule'
|
||||
import { DocumentResults, SelectionData } from '../data/results'
|
||||
import { SavedView } from '../data/saved-view'
|
||||
import { DOCUMENT_LIST_SERVICE } from '../data/storage-keys'
|
||||
import { SETTINGS_KEYS } from '../data/ui-settings'
|
||||
@@ -18,13 +17,27 @@ import {
|
||||
isFullTextFilterRule,
|
||||
} from '../utils/filter-rules'
|
||||
import { paramsFromViewState, paramsToViewState } from '../utils/query-params'
|
||||
import { DocumentService } from './rest/document.service'
|
||||
import { DocumentService, SelectionData } from './rest/document.service'
|
||||
import { SettingsService } from './settings.service'
|
||||
|
||||
const LIST_DEFAULT_DISPLAY_FIELDS: DisplayField[] = DEFAULT_DISPLAY_FIELDS.map(
|
||||
(f) => f.id
|
||||
).filter((f) => f !== DisplayField.ADDED)
|
||||
|
||||
const RESTORABLE_LIST_VIEW_STATE_KEYS: (keyof ListViewState)[] = [
|
||||
'title',
|
||||
'documents',
|
||||
'currentPage',
|
||||
'collectionSize',
|
||||
'sortField',
|
||||
'sortReverse',
|
||||
'filterRules',
|
||||
'selected',
|
||||
'pageSize',
|
||||
'displayMode',
|
||||
'displayFields',
|
||||
]
|
||||
|
||||
/**
|
||||
* Captures the current state of the list view.
|
||||
*/
|
||||
@@ -113,6 +126,32 @@ export class DocumentListViewService {
|
||||
|
||||
private displayFieldsInitialized: boolean = false
|
||||
|
||||
private restoreListViewState(savedState: unknown): ListViewState {
|
||||
const newState = this.defaultListViewState()
|
||||
|
||||
if (
|
||||
!savedState ||
|
||||
typeof savedState !== 'object' ||
|
||||
Array.isArray(savedState)
|
||||
) {
|
||||
return newState
|
||||
}
|
||||
|
||||
const parsedState = savedState as Partial<
|
||||
Record<keyof ListViewState, unknown>
|
||||
>
|
||||
const mutableState = newState as Record<keyof ListViewState, unknown>
|
||||
|
||||
for (const key of RESTORABLE_LIST_VIEW_STATE_KEYS) {
|
||||
const value = parsedState[key]
|
||||
if (value != null) {
|
||||
mutableState[key] = value
|
||||
}
|
||||
}
|
||||
|
||||
return newState
|
||||
}
|
||||
|
||||
get activeSavedViewId() {
|
||||
return this._activeSavedViewId
|
||||
}
|
||||
@@ -128,14 +167,7 @@ export class DocumentListViewService {
|
||||
if (documentListViewConfigJson) {
|
||||
try {
|
||||
let savedState: ListViewState = JSON.parse(documentListViewConfigJson)
|
||||
// Remove null elements from the restored state
|
||||
Object.keys(savedState).forEach((k) => {
|
||||
if (savedState[k] == null) {
|
||||
delete savedState[k]
|
||||
}
|
||||
})
|
||||
// only use restored state attributes instead of defaults if they are not null
|
||||
let newState = Object.assign(this.defaultListViewState(), savedState)
|
||||
let newState = this.restoreListViewState(savedState)
|
||||
this.listViewStates.set(null, newState)
|
||||
} catch (e) {
|
||||
localStorage.removeItem(DOCUMENT_LIST_SERVICE.CURRENT_VIEW_CONFIG)
|
||||
@@ -261,17 +293,27 @@ export class DocumentListViewService {
|
||||
activeListViewState.sortField,
|
||||
activeListViewState.sortReverse,
|
||||
activeListViewState.filterRules,
|
||||
{ truncate_content: true, include_selection_data: true }
|
||||
{ truncate_content: true }
|
||||
)
|
||||
.pipe(takeUntil(this.unsubscribeNotifier))
|
||||
.subscribe({
|
||||
next: (result) => {
|
||||
const resultWithSelectionData = result as DocumentResults
|
||||
this.initialized = true
|
||||
this.isReloading = false
|
||||
activeListViewState.collectionSize = result.count
|
||||
activeListViewState.documents = result.results
|
||||
this.selectionData = resultWithSelectionData.selection_data ?? null
|
||||
|
||||
this.documentService
|
||||
.getSelectionData(result.all)
|
||||
.pipe(first())
|
||||
.subscribe({
|
||||
next: (selectionData) => {
|
||||
this.selectionData = selectionData
|
||||
},
|
||||
error: () => {
|
||||
this.selectionData = null
|
||||
},
|
||||
})
|
||||
|
||||
if (updateQueryParams && !this._activeSavedViewId) {
|
||||
let base = ['/documents']
|
||||
|
||||
@@ -12,7 +12,7 @@ import {
|
||||
import { DocumentMetadata } from 'src/app/data/document-metadata'
|
||||
import { DocumentSuggestions } from 'src/app/data/document-suggestions'
|
||||
import { FilterRule } from 'src/app/data/filter-rule'
|
||||
import { Results, SelectionData } from 'src/app/data/results'
|
||||
import { Results } from 'src/app/data/results'
|
||||
import { SETTINGS_KEYS } from 'src/app/data/ui-settings'
|
||||
import { queryParamsFromFilterRules } from '../../utils/query-params'
|
||||
import {
|
||||
@@ -24,6 +24,19 @@ import { SettingsService } from '../settings.service'
|
||||
import { AbstractPaperlessService } from './abstract-paperless-service'
|
||||
import { CustomFieldsService } from './custom-fields.service'
|
||||
|
||||
export interface SelectionDataItem {
|
||||
id: number
|
||||
document_count: number
|
||||
}
|
||||
|
||||
export interface SelectionData {
|
||||
selected_storage_paths: SelectionDataItem[]
|
||||
selected_correspondents: SelectionDataItem[]
|
||||
selected_tags: SelectionDataItem[]
|
||||
selected_document_types: SelectionDataItem[]
|
||||
selected_custom_fields: SelectionDataItem[]
|
||||
}
|
||||
|
||||
export enum BulkEditSourceMode {
|
||||
LATEST_VERSION = 'latest_version',
|
||||
EXPLICIT_SELECTION = 'explicit_selection',
|
||||
|
||||
@@ -166,6 +166,23 @@ describe('SettingsService', () => {
|
||||
expect(settingsService.get(SETTINGS_KEYS.THEME_COLOR)).toEqual('#9fbf2f')
|
||||
})
|
||||
|
||||
it('ignores unsafe top-level keys from loaded settings', () => {
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}ui_settings/`
|
||||
)
|
||||
const payload = JSON.parse(
|
||||
JSON.stringify(ui_settings).replace(
|
||||
'"settings":{',
|
||||
'"settings":{"__proto__":{"polluted":"yes"},'
|
||||
)
|
||||
)
|
||||
payload.settings.app_title = 'Safe Title'
|
||||
req.flush(payload)
|
||||
|
||||
expect(settingsService.get(SETTINGS_KEYS.APP_TITLE)).toEqual('Safe Title')
|
||||
expect(({} as any).polluted).toBeUndefined()
|
||||
})
|
||||
|
||||
it('correctly allows updating settings of various types', () => {
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}ui_settings/`
|
||||
|
||||
@@ -276,6 +276,8 @@ const ISO_LANGUAGE_OPTION: LanguageOption = {
|
||||
dateInputFormat: 'yyyy-mm-dd',
|
||||
}
|
||||
|
||||
const UNSAFE_OBJECT_KEYS = new Set(['__proto__', 'prototype', 'constructor'])
|
||||
|
||||
@Injectable({
|
||||
providedIn: 'root',
|
||||
})
|
||||
@@ -291,7 +293,7 @@ export class SettingsService {
|
||||
|
||||
protected baseUrl: string = environment.apiBaseUrl + 'ui_settings/'
|
||||
|
||||
private settings: Object = {}
|
||||
private settings: Record<string, any> = {}
|
||||
currentUser: User
|
||||
|
||||
public settingsSaved: EventEmitter<any> = new EventEmitter()
|
||||
@@ -320,6 +322,21 @@ export class SettingsService {
|
||||
this._renderer = rendererFactory.createRenderer(null, null)
|
||||
}
|
||||
|
||||
private isSafeObjectKey(key: string): boolean {
|
||||
return !UNSAFE_OBJECT_KEYS.has(key)
|
||||
}
|
||||
|
||||
private assignSafeSettings(source: Record<string, any>) {
|
||||
if (!source || typeof source !== 'object' || Array.isArray(source)) {
|
||||
return
|
||||
}
|
||||
|
||||
for (const key of Object.keys(source)) {
|
||||
if (!this.isSafeObjectKey(key)) continue
|
||||
this.settings[key] = source[key]
|
||||
}
|
||||
}
|
||||
|
||||
// this is called by the app initializer in app.module
|
||||
public initializeSettings(): Observable<UiSettings> {
|
||||
return this.http.get<UiSettings>(this.baseUrl).pipe(
|
||||
@@ -338,7 +355,7 @@ export class SettingsService {
|
||||
})
|
||||
}),
|
||||
tap((uisettings) => {
|
||||
Object.assign(this.settings, uisettings.settings)
|
||||
this.assignSafeSettings(uisettings.settings)
|
||||
if (this.get(SETTINGS_KEYS.APP_TITLE)?.length) {
|
||||
environment.appTitle = this.get(SETTINGS_KEYS.APP_TITLE)
|
||||
}
|
||||
@@ -533,7 +550,11 @@ export class SettingsService {
|
||||
let settingObj = this.settings
|
||||
keys.forEach((keyPart, index) => {
|
||||
keyPart = keyPart.replace(/-/g, '_')
|
||||
if (!settingObj.hasOwnProperty(keyPart)) return
|
||||
if (
|
||||
!this.isSafeObjectKey(keyPart) ||
|
||||
!Object.prototype.hasOwnProperty.call(settingObj, keyPart)
|
||||
)
|
||||
return
|
||||
if (index == keys.length - 1) value = settingObj[keyPart]
|
||||
else settingObj = settingObj[keyPart]
|
||||
})
|
||||
@@ -579,7 +600,9 @@ export class SettingsService {
|
||||
const keys = key.replace('general-settings:', '').split(':')
|
||||
keys.forEach((keyPart, index) => {
|
||||
keyPart = keyPart.replace(/-/g, '_')
|
||||
if (!settingObj.hasOwnProperty(keyPart)) settingObj[keyPart] = {}
|
||||
if (!this.isSafeObjectKey(keyPart)) return
|
||||
if (!Object.prototype.hasOwnProperty.call(settingObj, keyPart))
|
||||
settingObj[keyPart] = {}
|
||||
if (index == keys.length - 1) settingObj[keyPart] = value
|
||||
else settingObj = settingObj[keyPart]
|
||||
})
|
||||
@@ -602,7 +625,10 @@ export class SettingsService {
|
||||
|
||||
maybeMigrateSettings() {
|
||||
if (
|
||||
!this.settings.hasOwnProperty('documentListSize') &&
|
||||
!Object.prototype.hasOwnProperty.call(
|
||||
this.settings,
|
||||
'documentListSize'
|
||||
) &&
|
||||
localStorage.getItem(SETTINGS_KEYS.DOCUMENT_LIST_SIZE)
|
||||
) {
|
||||
// lets migrate
|
||||
@@ -610,8 +636,7 @@ export class SettingsService {
|
||||
const errorMessage = $localize`Unable to migrate settings to the database, please try saving manually.`
|
||||
|
||||
try {
|
||||
for (const setting in SETTINGS_KEYS) {
|
||||
const key = SETTINGS_KEYS[setting]
|
||||
for (const key of Object.values(SETTINGS_KEYS)) {
|
||||
const value = localStorage.getItem(key)
|
||||
this.set(key, value)
|
||||
}
|
||||
|
||||
@@ -3,25 +3,20 @@ from django.core.checks import Error
|
||||
from django.core.checks import Warning
|
||||
from django.core.checks import register
|
||||
|
||||
from documents.signals import document_consumer_declaration
|
||||
from documents.templating.utils import convert_format_str_to_template_format
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
|
||||
@register()
|
||||
def parser_check(app_configs, **kwargs):
|
||||
parsers = []
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parsers.append(response[1])
|
||||
|
||||
if len(parsers) == 0:
|
||||
if not get_parser_registry().all_parsers():
|
||||
return [
|
||||
Error(
|
||||
"No parsers found. This is a bug. The consumer won't be "
|
||||
"able to consume any documents without parsers.",
|
||||
),
|
||||
]
|
||||
else:
|
||||
return []
|
||||
return []
|
||||
|
||||
|
||||
@register()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from enum import StrEnum
|
||||
from pathlib import Path
|
||||
@@ -32,9 +32,7 @@ from documents.models import DocumentType
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.permissions import set_permissions_for_object
|
||||
from documents.plugins.base import AlwaysRunPluginMixin
|
||||
from documents.plugins.base import ConsumeTaskPlugin
|
||||
@@ -48,44 +46,22 @@ from documents.signals import document_consumption_started
|
||||
from documents.signals import document_updated
|
||||
from documents.signals.handlers import run_workflows
|
||||
from documents.templating.workflows import parse_w_workflow_placeholders
|
||||
from documents.utils import compute_checksum
|
||||
from documents.utils import copy_basic_file_stats
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileGenerationChoices
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
|
||||
from paperless.parsers.utils import extract_pdf_text
|
||||
from paperless.parsers.utils import is_tagged_pdf
|
||||
|
||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||
|
||||
|
||||
def _parser_cleanup(parser: DocumentParser) -> None:
|
||||
"""
|
||||
Call cleanup on a parser, handling the new-style context-manager parsers.
|
||||
|
||||
New-style parsers (e.g. TextDocumentParser) use __exit__ for teardown
|
||||
instead of a cleanup() method. This shim will be removed once all existing parsers
|
||||
have switched to the new style and this consumer is updated to use it
|
||||
|
||||
TODO(stumpylog): Remove me in the future
|
||||
"""
|
||||
if isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
):
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
class WorkflowTriggerPlugin(
|
||||
NoCleanupPluginMixin,
|
||||
NoSetupPluginMixin,
|
||||
@@ -134,6 +110,44 @@ class ConsumerStatusShortMessage(StrEnum):
|
||||
FAILED = "failed"
|
||||
|
||||
|
||||
def should_produce_archive(
|
||||
parser: "ParserProtocol",
|
||||
mime_type: str,
|
||||
document_path: Path,
|
||||
) -> bool:
|
||||
"""Return True if a PDF/A archive should be produced for this document.
|
||||
|
||||
IMPORTANT: *parser* must be an instantiated parser, not the class.
|
||||
``requires_pdf_rendition`` and ``can_produce_archive`` are instance
|
||||
``@property`` methods — accessing them on the class returns the descriptor
|
||||
(always truthy).
|
||||
"""
|
||||
# Must produce a PDF so the frontend can display the original format at all.
|
||||
if parser.requires_pdf_rendition:
|
||||
return True
|
||||
|
||||
# Parser cannot produce an archive (e.g. TextDocumentParser).
|
||||
if not parser.can_produce_archive:
|
||||
return False
|
||||
|
||||
generation = OcrConfig().archive_file_generation
|
||||
|
||||
if generation == ArchiveFileGenerationChoices.ALWAYS:
|
||||
return True
|
||||
if generation == ArchiveFileGenerationChoices.NEVER:
|
||||
return False
|
||||
|
||||
# auto: produce archives for scanned/image documents; skip for born-digital PDFs.
|
||||
if mime_type.startswith("image/"):
|
||||
return True
|
||||
if mime_type == "application/pdf":
|
||||
if is_tagged_pdf(document_path):
|
||||
return False
|
||||
text = extract_pdf_text(document_path)
|
||||
return text is None or len(text) <= PDF_TEXT_MIN_LENGTH
|
||||
return False
|
||||
|
||||
|
||||
class ConsumerPluginMixin:
|
||||
if TYPE_CHECKING:
|
||||
from logging import Logger
|
||||
@@ -226,9 +240,7 @@ class ConsumerPlugin(
|
||||
version_doc = Document(
|
||||
root_document=root_doc_frozen,
|
||||
version_index=next_version_index + 1,
|
||||
checksum=hashlib.md5(
|
||||
file_for_checksum.read_bytes(),
|
||||
).hexdigest(),
|
||||
checksum=compute_checksum(file_for_checksum),
|
||||
content=text or "",
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
@@ -368,18 +380,15 @@ class ConsumerPlugin(
|
||||
Return the document object if it was successfully created.
|
||||
"""
|
||||
|
||||
tempdir = None
|
||||
# Preflight has already run including progress update to 0%
|
||||
self.log.info(f"Consuming {self.filename}")
|
||||
|
||||
try:
|
||||
# Preflight has already run including progress update to 0%
|
||||
self.log.info(f"Consuming {self.filename}")
|
||||
|
||||
# For the actual work, copy the file into a tempdir
|
||||
tempdir = tempfile.TemporaryDirectory(
|
||||
prefix="paperless-ngx",
|
||||
dir=settings.SCRATCH_DIR,
|
||||
)
|
||||
self.working_copy = Path(tempdir.name) / Path(self.filename)
|
||||
# For the actual work, copy the file into a tempdir
|
||||
with tempfile.TemporaryDirectory(
|
||||
prefix="paperless-ngx",
|
||||
dir=settings.SCRATCH_DIR,
|
||||
) as tmpdir:
|
||||
self.working_copy = Path(tmpdir) / Path(self.filename)
|
||||
copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
|
||||
self.unmodified_original = None
|
||||
|
||||
@@ -411,7 +420,7 @@ class ConsumerPlugin(
|
||||
self.log.debug(f"Detected mime type after qpdf: {mime_type}")
|
||||
# Save the original file for later
|
||||
self.unmodified_original = (
|
||||
Path(tempdir.name) / Path("uo") / Path(self.filename)
|
||||
Path(tmpdir) / Path("uo") / Path(self.filename)
|
||||
)
|
||||
self.unmodified_original.parent.mkdir(exist_ok=True)
|
||||
copy_file_with_basic_stats(
|
||||
@@ -422,11 +431,14 @@ class ConsumerPlugin(
|
||||
self.log.error(f"Error attempting to clean PDF: {e}")
|
||||
|
||||
# Based on the mime type, get the parser for that type
|
||||
parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
|
||||
mime_type,
|
||||
parser_class: type[ParserProtocol] | None = (
|
||||
get_parser_registry().get_parser_for_file(
|
||||
mime_type,
|
||||
self.filename,
|
||||
self.working_copy,
|
||||
)
|
||||
)
|
||||
if not parser_class:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
ConsumerStatusShortMessage.UNSUPPORTED_TYPE,
|
||||
f"Unsupported mime type {mime_type}",
|
||||
@@ -441,319 +453,284 @@ class ConsumerPlugin(
|
||||
)
|
||||
|
||||
self.run_pre_consume_script()
|
||||
except:
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
raise
|
||||
|
||||
def progress_callback(
|
||||
current_progress,
|
||||
max_progress,
|
||||
) -> None: # pragma: no cover
|
||||
# recalculate progress to be within 20 and 80
|
||||
p = int((current_progress / max_progress) * 50 + 20)
|
||||
self._send_progress(p, 100, ProgressStatusOptions.WORKING)
|
||||
|
||||
# This doesn't parse the document yet, but gives us a parser.
|
||||
|
||||
document_parser: DocumentParser = parser_class(
|
||||
self.logging_group,
|
||||
progress_callback=progress_callback,
|
||||
)
|
||||
|
||||
parser_is_new_style = isinstance(
|
||||
document_parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
)
|
||||
|
||||
# New-style parsers use __enter__/__exit__ for resource management.
|
||||
# _parser_cleanup (below) handles __exit__; call __enter__ here.
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
if parser_is_new_style:
|
||||
document_parser.__enter__()
|
||||
|
||||
self.log.debug(f"Parser: {type(document_parser).__name__}")
|
||||
|
||||
# Parse the document. This may take some time.
|
||||
|
||||
text = None
|
||||
date = None
|
||||
thumbnail = None
|
||||
archive_path = None
|
||||
page_count = None
|
||||
|
||||
try:
|
||||
self._send_progress(
|
||||
20,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
||||
)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
|
||||
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
# This doesn't parse the document yet, but gives us a parser.
|
||||
with parser_class() as document_parser:
|
||||
document_parser.configure(
|
||||
ParserContext(mailrule_id=self.input_doc.mailrule_id),
|
||||
)
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
document_parser.parse(self.working_copy, mime_type)
|
||||
else:
|
||||
document_parser.parse(self.working_copy, mime_type, self.filename)
|
||||
|
||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||
self._send_progress(
|
||||
70,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||
)
|
||||
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||
else:
|
||||
thumbnail = document_parser.get_thumbnail(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
self.filename,
|
||||
self.log.debug(
|
||||
f"Parser: {document_parser.name} v{document_parser.version}",
|
||||
)
|
||||
|
||||
text = document_parser.get_text()
|
||||
date = document_parser.get_date()
|
||||
if date is None:
|
||||
# Parse the document. This may take some time.
|
||||
|
||||
text = None
|
||||
date = None
|
||||
thumbnail = None
|
||||
archive_path = None
|
||||
page_count = None
|
||||
|
||||
try:
|
||||
self._send_progress(
|
||||
20,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
||||
)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
|
||||
produce_archive = should_produce_archive(
|
||||
document_parser,
|
||||
mime_type,
|
||||
self.working_copy,
|
||||
)
|
||||
document_parser.parse(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
produce_archive=produce_archive,
|
||||
)
|
||||
|
||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||
self._send_progress(
|
||||
70,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||
)
|
||||
thumbnail = document_parser.get_thumbnail(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
)
|
||||
|
||||
text = document_parser.get_text()
|
||||
date = document_parser.get_date()
|
||||
if date is None:
|
||||
self._send_progress(
|
||||
90,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSE_DATE,
|
||||
)
|
||||
with get_date_parser() as date_parser:
|
||||
date = next(date_parser.parse(self.filename, text), None)
|
||||
archive_path = document_parser.get_archive_path()
|
||||
page_count = document_parser.get_page_count(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
)
|
||||
|
||||
except ParseError as e:
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Error occurred while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
except Exception as e:
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Unexpected error while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
|
||||
# Prepare the document classifier.
|
||||
|
||||
# TODO: I don't really like to do this here, but this way we avoid
|
||||
# reloading the classifier multiple times, since there are multiple
|
||||
# post-consume hooks that all require the classifier.
|
||||
|
||||
classifier = load_classifier()
|
||||
|
||||
self._send_progress(
|
||||
90,
|
||||
95,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSE_DATE,
|
||||
ConsumerStatusShortMessage.SAVE_DOCUMENT,
|
||||
)
|
||||
with get_date_parser() as date_parser:
|
||||
date = next(date_parser.parse(self.filename, text), None)
|
||||
archive_path = document_parser.get_archive_path()
|
||||
page_count = document_parser.get_page_count(self.working_copy, mime_type)
|
||||
|
||||
except ParseError as e:
|
||||
_parser_cleanup(document_parser)
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Error occurred while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
except Exception as e:
|
||||
_parser_cleanup(document_parser)
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Unexpected error while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
|
||||
# Prepare the document classifier.
|
||||
|
||||
# TODO: I don't really like to do this here, but this way we avoid
|
||||
# reloading the classifier multiple times, since there are multiple
|
||||
# post-consume hooks that all require the classifier.
|
||||
|
||||
classifier = load_classifier()
|
||||
|
||||
self._send_progress(
|
||||
95,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.SAVE_DOCUMENT,
|
||||
)
|
||||
# now that everything is done, we can start to store the document
|
||||
# in the system. This will be a transaction and reasonably fast.
|
||||
try:
|
||||
with transaction.atomic():
|
||||
# store the document.
|
||||
if self.input_doc.root_document_id:
|
||||
# If this is a new version of an existing document, we need
|
||||
# to make sure we're not creating a new document, but updating
|
||||
# the existing one.
|
||||
root_doc = Document.objects.get(
|
||||
pk=self.input_doc.root_document_id,
|
||||
)
|
||||
original_document = self._create_version_from_root(
|
||||
root_doc,
|
||||
text=text,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
actor = None
|
||||
|
||||
# Save the new version, potentially creating an audit log entry for the version addition if enabled.
|
||||
if (
|
||||
settings.AUDIT_LOG_ENABLED
|
||||
and self.metadata.actor_id is not None
|
||||
):
|
||||
actor = User.objects.filter(pk=self.metadata.actor_id).first()
|
||||
if actor is not None:
|
||||
from auditlog.context import ( # type: ignore[import-untyped]
|
||||
set_actor,
|
||||
# now that everything is done, we can start to store the document
|
||||
# in the system. This will be a transaction and reasonably fast.
|
||||
try:
|
||||
with transaction.atomic():
|
||||
# store the document.
|
||||
if self.input_doc.root_document_id:
|
||||
# If this is a new version of an existing document, we need
|
||||
# to make sure we're not creating a new document, but updating
|
||||
# the existing one.
|
||||
root_doc = Document.objects.get(
|
||||
pk=self.input_doc.root_document_id,
|
||||
)
|
||||
original_document = self._create_version_from_root(
|
||||
root_doc,
|
||||
text=text,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
actor = None
|
||||
|
||||
with set_actor(actor):
|
||||
# Save the new version, potentially creating an audit log entry for the version addition if enabled.
|
||||
if (
|
||||
settings.AUDIT_LOG_ENABLED
|
||||
and self.metadata.actor_id is not None
|
||||
):
|
||||
actor = User.objects.filter(
|
||||
pk=self.metadata.actor_id,
|
||||
).first()
|
||||
if actor is not None:
|
||||
from auditlog.context import ( # type: ignore[import-untyped]
|
||||
set_actor,
|
||||
)
|
||||
|
||||
with set_actor(actor):
|
||||
original_document.save()
|
||||
else:
|
||||
original_document.save()
|
||||
else:
|
||||
original_document.save()
|
||||
|
||||
# Create a log entry for the version addition, if enabled
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.models import ( # type: ignore[import-untyped]
|
||||
LogEntry,
|
||||
)
|
||||
|
||||
LogEntry.objects.log_create(
|
||||
instance=root_doc,
|
||||
changes={
|
||||
"Version Added": ["None", original_document.id],
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
actor=actor,
|
||||
additional_data={
|
||||
"reason": "Version added",
|
||||
"version_id": original_document.id,
|
||||
},
|
||||
)
|
||||
document = original_document
|
||||
else:
|
||||
original_document.save()
|
||||
else:
|
||||
original_document.save()
|
||||
|
||||
# Create a log entry for the version addition, if enabled
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.models import ( # type: ignore[import-untyped]
|
||||
LogEntry,
|
||||
)
|
||||
|
||||
LogEntry.objects.log_create(
|
||||
instance=root_doc,
|
||||
changes={
|
||||
"Version Added": ["None", original_document.id],
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
actor=actor,
|
||||
additional_data={
|
||||
"reason": "Version added",
|
||||
"version_id": original_document.id,
|
||||
},
|
||||
)
|
||||
document = original_document
|
||||
else:
|
||||
document = self._store(
|
||||
text=text,
|
||||
date=date,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
|
||||
# If we get here, it was successful. Proceed with post-consume
|
||||
# hooks. If they fail, nothing will get changed.
|
||||
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
classifier=classifier,
|
||||
original_file=self.unmodified_original
|
||||
if self.unmodified_original
|
||||
else self.working_copy,
|
||||
)
|
||||
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
generated_filename = generate_unique_filename(document)
|
||||
if (
|
||||
len(str(generated_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated source filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_filename = generate_filename(
|
||||
document,
|
||||
use_format=False,
|
||||
)
|
||||
document.filename = generated_filename
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
self._write(
|
||||
self.unmodified_original
|
||||
if self.unmodified_original is not None
|
||||
else self.working_copy,
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
self._write(
|
||||
thumbnail,
|
||||
document.thumbnail_path,
|
||||
)
|
||||
|
||||
if archive_path and Path(archive_path).is_file():
|
||||
generated_archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
if (
|
||||
len(str(generated_archive_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated archive filename exceeds db path limit, falling back to default naming",
|
||||
document = self._store(
|
||||
text=text,
|
||||
date=date,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
generated_archive_filename = generate_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
use_format=False,
|
||||
)
|
||||
document.archive_filename = generated_archive_filename
|
||||
create_source_path_directory(document.archive_path)
|
||||
self._write(
|
||||
archive_path,
|
||||
document.archive_path,
|
||||
|
||||
# If we get here, it was successful. Proceed with post-consume
|
||||
# hooks. If they fail, nothing will get changed.
|
||||
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
classifier=classifier,
|
||||
original_file=self.unmodified_original
|
||||
if self.unmodified_original
|
||||
else self.working_copy,
|
||||
)
|
||||
|
||||
with Path(archive_path).open("rb") as f:
|
||||
document.archive_checksum = hashlib.md5(
|
||||
f.read(),
|
||||
).hexdigest()
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
generated_filename = generate_unique_filename(document)
|
||||
if (
|
||||
len(str(generated_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated source filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_filename = generate_filename(
|
||||
document,
|
||||
use_format=False,
|
||||
)
|
||||
document.filename = generated_filename
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
# Don't save with the lock active. Saving will cause the file
|
||||
# renaming logic to acquire the lock as well.
|
||||
# This triggers things like file renaming
|
||||
document.save()
|
||||
self._write(
|
||||
self.unmodified_original
|
||||
if self.unmodified_original is not None
|
||||
else self.working_copy,
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
if document.root_document_id:
|
||||
document_updated.send(
|
||||
sender=self.__class__,
|
||||
document=document.root_document,
|
||||
self._write(
|
||||
thumbnail,
|
||||
document.thumbnail_path,
|
||||
)
|
||||
|
||||
if archive_path and Path(archive_path).is_file():
|
||||
generated_archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
if (
|
||||
len(str(generated_archive_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated archive filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_archive_filename = generate_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
use_format=False,
|
||||
)
|
||||
document.archive_filename = generated_archive_filename
|
||||
create_source_path_directory(document.archive_path)
|
||||
self._write(
|
||||
archive_path,
|
||||
document.archive_path,
|
||||
)
|
||||
|
||||
document.archive_checksum = compute_checksum(
|
||||
document.archive_path,
|
||||
)
|
||||
|
||||
# Don't save with the lock active. Saving will cause the file
|
||||
# renaming logic to acquire the lock as well.
|
||||
# This triggers things like file renaming
|
||||
document.save()
|
||||
|
||||
if document.root_document_id:
|
||||
document_updated.send(
|
||||
sender=self.__class__,
|
||||
document=document.root_document,
|
||||
)
|
||||
|
||||
# Delete the file only if it was successfully consumed
|
||||
self.log.debug(
|
||||
f"Deleting original file {self.input_doc.original_file}",
|
||||
)
|
||||
self.input_doc.original_file.unlink()
|
||||
self.log.debug(f"Deleting working copy {self.working_copy}")
|
||||
self.working_copy.unlink()
|
||||
if self.unmodified_original is not None: # pragma: no cover
|
||||
self.log.debug(
|
||||
f"Deleting unmodified original file {self.unmodified_original}",
|
||||
)
|
||||
self.unmodified_original.unlink()
|
||||
|
||||
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
||||
shadow_file = (
|
||||
Path(self.input_doc.original_file).parent
|
||||
/ f"._{Path(self.input_doc.original_file).name}"
|
||||
)
|
||||
|
||||
if Path(shadow_file).is_file():
|
||||
self.log.debug(f"Deleting shadow file {shadow_file}")
|
||||
Path(shadow_file).unlink()
|
||||
|
||||
except Exception as e:
|
||||
self._fail(
|
||||
str(e),
|
||||
f"The following error occurred while storing document "
|
||||
f"{self.filename} after parsing: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
|
||||
# Delete the file only if it was successfully consumed
|
||||
self.log.debug(f"Deleting original file {self.input_doc.original_file}")
|
||||
self.input_doc.original_file.unlink()
|
||||
self.log.debug(f"Deleting working copy {self.working_copy}")
|
||||
self.working_copy.unlink()
|
||||
if self.unmodified_original is not None: # pragma: no cover
|
||||
self.log.debug(
|
||||
f"Deleting unmodified original file {self.unmodified_original}",
|
||||
)
|
||||
self.unmodified_original.unlink()
|
||||
|
||||
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
||||
shadow_file = (
|
||||
Path(self.input_doc.original_file).parent
|
||||
/ f"._{Path(self.input_doc.original_file).name}"
|
||||
)
|
||||
|
||||
if Path(shadow_file).is_file():
|
||||
self.log.debug(f"Deleting shadow file {shadow_file}")
|
||||
Path(shadow_file).unlink()
|
||||
|
||||
except Exception as e:
|
||||
self._fail(
|
||||
str(e),
|
||||
f"The following error occurred while storing document "
|
||||
f"{self.filename} after parsing: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
finally:
|
||||
_parser_cleanup(document_parser)
|
||||
tempdir.cleanup()
|
||||
|
||||
self.run_post_consume_script(document)
|
||||
|
||||
self.log.info(f"Document {document} consumption finished")
|
||||
@@ -849,7 +826,7 @@ class ConsumerPlugin(
|
||||
title=title[:127],
|
||||
content=text,
|
||||
mime_type=mime_type,
|
||||
checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
|
||||
checksum=compute_checksum(file_for_checksum),
|
||||
created=create_date,
|
||||
modified=create_date,
|
||||
page_count=page_count,
|
||||
@@ -897,7 +874,7 @@ class ConsumerPlugin(
|
||||
self.metadata.view_users is not None
|
||||
or self.metadata.view_groups is not None
|
||||
or self.metadata.change_users is not None
|
||||
or self.metadata.change_users is not None
|
||||
or self.metadata.change_groups is not None
|
||||
):
|
||||
permissions = {
|
||||
"view": {
|
||||
@@ -930,7 +907,7 @@ class ConsumerPlugin(
|
||||
Path(source).open("rb") as read_file,
|
||||
Path(target).open("wb") as write_file,
|
||||
):
|
||||
write_file.write(read_file.read())
|
||||
shutil.copyfileobj(read_file, write_file)
|
||||
|
||||
# Attempt to copy file's original stats, but it's ok if we can't
|
||||
try:
|
||||
@@ -966,10 +943,9 @@ class ConsumerPreflightPlugin(
|
||||
|
||||
def pre_check_duplicate(self) -> None:
|
||||
"""
|
||||
Using the MD5 of the file, check this exact file doesn't already exist
|
||||
Using the SHA256 of the file, check this exact file doesn't already exist
|
||||
"""
|
||||
with Path(self.input_doc.original_file).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
checksum = compute_checksum(Path(self.input_doc.original_file))
|
||||
existing_doc = Document.global_objects.filter(
|
||||
Q(checksum=checksum) | Q(archive_checksum=checksum),
|
||||
)
|
||||
|
||||
@@ -56,6 +56,7 @@ from documents.models import WorkflowTrigger
|
||||
from documents.settings import EXPORTER_ARCHIVE_NAME
|
||||
from documents.settings import EXPORTER_FILE_NAME
|
||||
from documents.settings import EXPORTER_THUMBNAIL_NAME
|
||||
from documents.utils import compute_checksum
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from paperless import version
|
||||
from paperless.models import ApplicationConfiguration
|
||||
@@ -693,7 +694,7 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
source_stat = source.stat()
|
||||
target_stat = target.stat()
|
||||
if self.compare_checksums and source_checksum:
|
||||
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
|
||||
target_checksum = compute_checksum(target)
|
||||
perform_copy = target_checksum != source_checksum
|
||||
elif (
|
||||
source_stat.st_mtime != target_stat.st_mtime
|
||||
|
||||
@@ -3,19 +3,18 @@ import shutil
|
||||
|
||||
from documents.management.commands.base import PaperlessCommand
|
||||
from documents.models import Document
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
logger = logging.getLogger("paperless.management.thumbnails")
|
||||
|
||||
|
||||
def _process_document(doc_id: int) -> None:
|
||||
document: Document = Document.objects.get(id=doc_id)
|
||||
parser_class = get_parser_class_for_mime_type(document.mime_type)
|
||||
parser_class = get_parser_registry().get_parser_for_file(
|
||||
document.mime_type,
|
||||
document.original_filename or "",
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
if parser_class is None:
|
||||
logger.warning(
|
||||
@@ -25,40 +24,9 @@ def _process_document(doc_id: int) -> None:
|
||||
)
|
||||
return
|
||||
|
||||
parser = parser_class(logging_group=None)
|
||||
|
||||
parser_is_new_style = isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
)
|
||||
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
parser.__enter__()
|
||||
|
||||
try:
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
thumb = parser.get_thumbnail(document.source_path, document.mime_type)
|
||||
else:
|
||||
thumb = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
document.mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
with parser_class() as parser:
|
||||
thumb = parser.get_thumbnail(document.source_path, document.mime_type)
|
||||
shutil.move(thumb, document.thumbnail_path)
|
||||
finally:
|
||||
# TODO(stumpylog): Cleanup once all parsers are handled
|
||||
if parser_is_new_style:
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
class Command(PaperlessCommand):
|
||||
|
||||
130
src/documents/migrations/0016_sha256_checksums.py
Normal file
130
src/documents/migrations/0016_sha256_checksums.py
Normal file
@@ -0,0 +1,130 @@
|
||||
import hashlib
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
logger = logging.getLogger("paperless.migrations")
|
||||
|
||||
_CHUNK_SIZE = 65536 # 64 KiB — avoids loading entire files into memory
|
||||
_BATCH_SIZE = 500 # documents per bulk_update call
|
||||
_PROGRESS_INTERVAL = 500 # log a progress line every N documents
|
||||
|
||||
|
||||
def _sha256(path: Path) -> str:
|
||||
h = hashlib.sha256()
|
||||
with path.open("rb") as fh:
|
||||
while chunk := fh.read(_CHUNK_SIZE):
|
||||
h.update(chunk)
|
||||
return h.hexdigest()
|
||||
|
||||
|
||||
def recompute_checksums(apps, schema_editor):
|
||||
"""Recompute all document checksums from MD5 to SHA256."""
|
||||
Document = apps.get_model("documents", "Document")
|
||||
|
||||
total = Document.objects.count()
|
||||
if total == 0:
|
||||
return
|
||||
|
||||
logger.info("Recomputing SHA-256 checksums for %d document(s)...", total)
|
||||
|
||||
batch: list = []
|
||||
processed = 0
|
||||
|
||||
for doc in Document.objects.only(
|
||||
"pk",
|
||||
"filename",
|
||||
"checksum",
|
||||
"archive_filename",
|
||||
"archive_checksum",
|
||||
).iterator(chunk_size=_BATCH_SIZE):
|
||||
updated_fields: list[str] = []
|
||||
|
||||
# Reconstruct source path the same way Document.source_path does
|
||||
fname = str(doc.filename) if doc.filename else f"{doc.pk:07}.pdf"
|
||||
source_path = (settings.ORIGINALS_DIR / Path(fname)).resolve()
|
||||
|
||||
if source_path.exists():
|
||||
doc.checksum = _sha256(source_path)
|
||||
updated_fields.append("checksum")
|
||||
else:
|
||||
logger.warning(
|
||||
"Document %s: original file %s not found, checksum not updated.",
|
||||
doc.pk,
|
||||
source_path,
|
||||
)
|
||||
|
||||
# Mirror Document.has_archive_version: archive_filename is not None
|
||||
if doc.archive_filename is not None:
|
||||
archive_path = (
|
||||
settings.ARCHIVE_DIR / Path(str(doc.archive_filename))
|
||||
).resolve()
|
||||
if archive_path.exists():
|
||||
doc.archive_checksum = _sha256(archive_path)
|
||||
updated_fields.append("archive_checksum")
|
||||
else:
|
||||
logger.warning(
|
||||
"Document %s: archive file %s not found, checksum not updated.",
|
||||
doc.pk,
|
||||
archive_path,
|
||||
)
|
||||
|
||||
if updated_fields:
|
||||
batch.append(doc)
|
||||
|
||||
processed += 1
|
||||
|
||||
if len(batch) >= _BATCH_SIZE:
|
||||
Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
|
||||
batch.clear()
|
||||
|
||||
if processed % _PROGRESS_INTERVAL == 0:
|
||||
logger.info(
|
||||
"SHA-256 checksum progress: %d/%d (%d%%)",
|
||||
processed,
|
||||
total,
|
||||
processed * 100 // total,
|
||||
)
|
||||
|
||||
if batch:
|
||||
Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
|
||||
|
||||
logger.info(
|
||||
"SHA-256 checksum recomputation complete: %d document(s) processed.",
|
||||
total,
|
||||
)
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("documents", "0015_document_version_index_and_more"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name="document",
|
||||
name="checksum",
|
||||
field=models.CharField(
|
||||
editable=False,
|
||||
help_text="The checksum of the original document.",
|
||||
max_length=64,
|
||||
verbose_name="checksum",
|
||||
),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name="document",
|
||||
name="archive_checksum",
|
||||
field=models.CharField(
|
||||
blank=True,
|
||||
editable=False,
|
||||
help_text="The checksum of the archived document.",
|
||||
max_length=64,
|
||||
null=True,
|
||||
verbose_name="archive checksum",
|
||||
),
|
||||
),
|
||||
migrations.RunPython(recompute_checksums, migrations.RunPython.noop),
|
||||
]
|
||||
@@ -216,14 +216,14 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
|
||||
|
||||
checksum = models.CharField(
|
||||
_("checksum"),
|
||||
max_length=32,
|
||||
max_length=64,
|
||||
editable=False,
|
||||
help_text=_("The checksum of the original document."),
|
||||
)
|
||||
|
||||
archive_checksum = models.CharField(
|
||||
_("archive checksum"),
|
||||
max_length=32,
|
||||
max_length=64,
|
||||
editable=False,
|
||||
blank=True,
|
||||
null=True,
|
||||
|
||||
@@ -3,84 +3,47 @@ from __future__ import annotations
|
||||
import logging
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from documents.loggers import LoggingMixin
|
||||
from documents.signals import document_consumer_declaration
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
|
||||
# This regular expression will try to find dates in the document at
|
||||
# hand and will match the following formats:
|
||||
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - ZZZZ.XX.YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - ZZZZ/XX/YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - ZZZZ-XX-YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
||||
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
||||
# - XX MON ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits. MONTH is 3 letters
|
||||
# - XXPP MONTH ZZZZ with XX being 1 or 2 and PP being 2 letters and ZZZZ being 4 digits
|
||||
|
||||
# TODO: isn't there a date parsing library for this?
|
||||
|
||||
DATE_REGEX = re.compile(
|
||||
r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.getLogger("paperless.parsing")
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
|
||||
def is_mime_type_supported(mime_type: str) -> bool:
|
||||
"""
|
||||
Returns True if the mime type is supported, False otherwise
|
||||
"""
|
||||
return get_parser_class_for_mime_type(mime_type) is not None
|
||||
return get_parser_registry().get_parser_for_file(mime_type, "") is not None
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
|
||||
def get_default_file_extension(mime_type: str) -> str:
|
||||
"""
|
||||
Returns the default file extension for a mimetype, or
|
||||
an empty string if it could not be determined
|
||||
"""
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
if mime_type in supported_mime_types:
|
||||
return supported_mime_types[mime_type]
|
||||
parser_class = get_parser_registry().get_parser_for_file(mime_type, "")
|
||||
if parser_class is not None:
|
||||
supported = parser_class.supported_mime_types()
|
||||
if mime_type in supported:
|
||||
return supported[mime_type]
|
||||
|
||||
ext = mimetypes.guess_extension(mime_type)
|
||||
if ext:
|
||||
return ext
|
||||
else:
|
||||
return ""
|
||||
return ext if ext else ""
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
|
||||
def is_file_ext_supported(ext: str) -> bool:
|
||||
"""
|
||||
Returns True if the file extension is supported, False otherwise
|
||||
@@ -94,44 +57,17 @@ def is_file_ext_supported(ext: str) -> bool:
|
||||
|
||||
def get_supported_file_extensions() -> set[str]:
|
||||
extensions = set()
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
for mime_type in supported_mime_types:
|
||||
for parser_class in get_parser_registry().all_parsers():
|
||||
for mime_type, ext in parser_class.supported_mime_types().items():
|
||||
extensions.update(mimetypes.guess_all_extensions(mime_type))
|
||||
# Python's stdlib might be behind, so also add what the parser
|
||||
# says is the default extension
|
||||
# This makes image/webp supported on Python < 3.11
|
||||
extensions.add(supported_mime_types[mime_type])
|
||||
extensions.add(ext)
|
||||
|
||||
return extensions
|
||||
|
||||
|
||||
def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None:
|
||||
"""
|
||||
Returns the best parser (by weight) for the given mimetype or
|
||||
None if no parser exists
|
||||
"""
|
||||
|
||||
options = []
|
||||
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
if mime_type in supported_mime_types:
|
||||
options.append(parser_declaration)
|
||||
|
||||
if not options:
|
||||
return None
|
||||
|
||||
best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0]
|
||||
|
||||
# Return the parser with the highest weight.
|
||||
return best_parser["parser"]
|
||||
|
||||
|
||||
def run_convert(
|
||||
input_file,
|
||||
output_file,
|
||||
|
||||
@@ -11,7 +11,6 @@ is an identity function that adds no overhead.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
@@ -30,6 +29,7 @@ from django.utils import timezone
|
||||
|
||||
from documents.models import Document
|
||||
from documents.models import PaperlessTask
|
||||
from documents.utils import compute_checksum
|
||||
from paperless.config import GeneralConfig
|
||||
|
||||
logger = logging.getLogger("paperless.sanity_checker")
|
||||
@@ -218,7 +218,7 @@ def _check_original(
|
||||
|
||||
present_files.discard(source_path)
|
||||
try:
|
||||
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
|
||||
checksum = compute_checksum(source_path)
|
||||
except OSError as e:
|
||||
messages.error(doc.pk, f"Cannot read original file of document: {e}")
|
||||
else:
|
||||
@@ -255,7 +255,7 @@ def _check_archive(
|
||||
|
||||
present_files.discard(archive_path)
|
||||
try:
|
||||
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
|
||||
checksum = compute_checksum(archive_path)
|
||||
except OSError as e:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
|
||||
@@ -2,5 +2,4 @@ from django.dispatch import Signal
|
||||
|
||||
document_consumption_started = Signal()
|
||||
document_consumption_finished = Signal()
|
||||
document_consumer_declaration = Signal()
|
||||
document_updated = Signal()
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
import shutil
|
||||
import uuid
|
||||
@@ -35,6 +34,7 @@ from documents.consumer import AsnCheckPlugin
|
||||
from documents.consumer import ConsumerPlugin
|
||||
from documents.consumer import ConsumerPreflightPlugin
|
||||
from documents.consumer import WorkflowTriggerPlugin
|
||||
from documents.consumer import should_produce_archive
|
||||
from documents.data_models import ConsumableDocument
|
||||
from documents.data_models import DocumentMetadataOverrides
|
||||
from documents.double_sided import CollatePlugin
|
||||
@@ -52,25 +52,20 @@ from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.models import WorkflowRun
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.plugins.base import ConsumeTaskPlugin
|
||||
from documents.plugins.base import ProgressManager
|
||||
from documents.plugins.base import StopConsumeTaskError
|
||||
from documents.plugins.helpers import ProgressManager
|
||||
from documents.plugins.helpers import ProgressStatusOptions
|
||||
from documents.sanity_checker import SanityCheckFailedException
|
||||
from documents.signals import document_updated
|
||||
from documents.signals.handlers import cleanup_document_deletion
|
||||
from documents.signals.handlers import run_workflows
|
||||
from documents.signals.handlers import send_websocket_document_updated
|
||||
from documents.utils import compute_checksum
|
||||
from documents.workflows.utils import get_workflows_for_trigger
|
||||
from paperless.config import AIConfig
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless_ai.indexing import llm_index_add_or_update_document
|
||||
from paperless_ai.indexing import llm_index_remove_document
|
||||
from paperless_ai.indexing import update_llm_index
|
||||
@@ -310,8 +305,10 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
|
||||
mime_type = document.mime_type
|
||||
|
||||
parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
|
||||
parser_class = get_parser_registry().get_parser_for_file(
|
||||
mime_type,
|
||||
document.original_filename or "",
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
if not parser_class:
|
||||
@@ -321,138 +318,100 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
)
|
||||
return
|
||||
|
||||
parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
|
||||
with parser_class() as parser:
|
||||
parser.configure(ParserContext())
|
||||
|
||||
parser_is_new_style = isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
)
|
||||
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
parser.__enter__()
|
||||
|
||||
try:
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
parser.configure(ParserContext())
|
||||
parser.parse(document.source_path, mime_type)
|
||||
else:
|
||||
try:
|
||||
produce_archive = should_produce_archive(
|
||||
parser,
|
||||
mime_type,
|
||||
document.source_path,
|
||||
)
|
||||
parser.parse(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
document.get_public_filename(),
|
||||
produce_archive=produce_archive,
|
||||
)
|
||||
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if parser_is_new_style:
|
||||
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
|
||||
else:
|
||||
thumbnail = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
|
||||
with transaction.atomic():
|
||||
oldDocument = Document.objects.get(pk=document.pk)
|
||||
if parser.get_archive_path():
|
||||
with Path(parser.get_archive_path()).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
# I'm going to save first so that in case the file move
|
||||
# fails, the database is rolled back.
|
||||
# We also don't use save() since that triggers the filehandling
|
||||
# logic, and we don't want that yet (file not yet in place)
|
||||
document.archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
archive_checksum=checksum,
|
||||
content=parser.get_text(),
|
||||
archive_filename=document.archive_filename,
|
||||
)
|
||||
newDocument = Document.objects.get(pk=document.pk)
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, newDocument.content],
|
||||
"archive_checksum": [
|
||||
oldDocument.archive_checksum,
|
||||
newDocument.archive_checksum,
|
||||
],
|
||||
"archive_filename": [
|
||||
oldDocument.archive_filename,
|
||||
newDocument.archive_filename,
|
||||
],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
else:
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
content=parser.get_text(),
|
||||
)
|
||||
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, parser.get_text()],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
with transaction.atomic():
|
||||
oldDocument = Document.objects.get(pk=document.pk)
|
||||
if parser.get_archive_path():
|
||||
create_source_path_directory(document.archive_path)
|
||||
shutil.move(parser.get_archive_path(), document.archive_path)
|
||||
shutil.move(thumbnail, document.thumbnail_path)
|
||||
checksum = compute_checksum(parser.get_archive_path())
|
||||
# I'm going to save first so that in case the file move
|
||||
# fails, the database is rolled back.
|
||||
# We also don't use save() since that triggers the filehandling
|
||||
# logic, and we don't want that yet (file not yet in place)
|
||||
document.archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
archive_checksum=checksum,
|
||||
content=parser.get_text(),
|
||||
archive_filename=document.archive_filename,
|
||||
)
|
||||
newDocument = Document.objects.get(pk=document.pk)
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, newDocument.content],
|
||||
"archive_checksum": [
|
||||
oldDocument.archive_checksum,
|
||||
newDocument.archive_checksum,
|
||||
],
|
||||
"archive_filename": [
|
||||
oldDocument.archive_filename,
|
||||
newDocument.archive_filename,
|
||||
],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
else:
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
content=parser.get_text(),
|
||||
)
|
||||
|
||||
document.refresh_from_db()
|
||||
logger.info(
|
||||
f"Updating index for document {document_id} ({document.archive_checksum})",
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, document)
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, parser.get_text()],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
|
||||
ai_config = AIConfig()
|
||||
if ai_config.llm_index_enabled:
|
||||
llm_index_add_or_update_document(document)
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
if parser.get_archive_path():
|
||||
create_source_path_directory(document.archive_path)
|
||||
shutil.move(parser.get_archive_path(), document.archive_path)
|
||||
shutil.move(thumbnail, document.thumbnail_path)
|
||||
|
||||
clear_document_caches(document.pk)
|
||||
document.refresh_from_db()
|
||||
logger.info(
|
||||
f"Updating index for document {document_id} ({document.archive_checksum})",
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, document)
|
||||
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Error while parsing document {document} (ID: {document_id})",
|
||||
)
|
||||
finally:
|
||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||
if isinstance(
|
||||
parser,
|
||||
(
|
||||
MailDocumentParser,
|
||||
RasterisedDocumentParser,
|
||||
RemoteDocumentParser,
|
||||
TextDocumentParser,
|
||||
TikaDocumentParser,
|
||||
),
|
||||
):
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
ai_config = AIConfig()
|
||||
if ai_config.llm_index_enabled:
|
||||
llm_index_add_or_update_document(document)
|
||||
|
||||
clear_document_caches(document.pk)
|
||||
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Error while parsing document {document} (ID: {document_id})",
|
||||
)
|
||||
|
||||
|
||||
@shared_task
|
||||
@@ -583,13 +542,13 @@ def check_scheduled_workflows() -> None:
|
||||
id__in=matched_ids,
|
||||
)
|
||||
|
||||
if documents.count() > 0:
|
||||
if documents.exists():
|
||||
documents = prefilter_documents_by_workflowtrigger(
|
||||
documents,
|
||||
trigger,
|
||||
)
|
||||
|
||||
if documents.count() > 0:
|
||||
if documents.exists():
|
||||
logger.debug(
|
||||
f"Found {documents.count()} documents for trigger {trigger}",
|
||||
)
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
<p>
|
||||
{% translate "Please sign in." %}
|
||||
{% if ACCOUNT_ALLOW_SIGNUPS %}
|
||||
<br/>{% blocktrans %}Don't have an account yet? <a href="{{ signup_url }}">Sign up</a>{% endblocktrans %}
|
||||
<br/>{% translate "Don't have an account yet?" %} <a href="{{ signup_url }}">{% translate "Sign up" %}</a>
|
||||
{% endif %}
|
||||
</p>
|
||||
{% endblock form_top_content %}
|
||||
@@ -25,12 +25,12 @@
|
||||
{% translate "Username" as i18n_username %}
|
||||
{% translate "Password" as i18n_password %}
|
||||
<div class="form-floating form-stacked-top">
|
||||
<input type="text" name="login" id="inputUsername" placeholder="{{ i18n_username }}" class="form-control" autocorrect="off" autocapitalize="none" required autofocus>
|
||||
<label for="inputUsername">{{ i18n_username }}</label>
|
||||
<input type="text" name="login" id="inputUsername" placeholder="{{ i18n_username|force_escape }}" class="form-control" autocorrect="off" autocapitalize="none" required autofocus>
|
||||
<label for="inputUsername">{{ i18n_username|force_escape }}</label>
|
||||
</div>
|
||||
<div class="form-floating form-stacked-bottom">
|
||||
<input type="password" name="password" id="inputPassword" placeholder="{{ i18n_password }}" class="form-control" required>
|
||||
<label for="inputPassword">{{ i18n_password }}</label>
|
||||
<input type="password" name="password" id="inputPassword" placeholder="{{ i18n_password|force_escape }}" class="form-control" required>
|
||||
<label for="inputPassword">{{ i18n_password|force_escape }}</label>
|
||||
</div>
|
||||
<div class="d-grid mt-3">
|
||||
<button class="btn btn-lg btn-primary" type="submit">{% translate "Sign in" %}</button>
|
||||
|
||||
@@ -14,8 +14,8 @@
|
||||
{% endif %}
|
||||
{% translate "Email" as i18n_email %}
|
||||
<div class="form-floating">
|
||||
<input type="email" name="{{form.email.name}}" id="inputEmail" placeholder="{{ i18n_email }}" class="form-control" required>
|
||||
<label for="inputEmail">{{ i18n_email }}</label>
|
||||
<input type="email" name="{{form.email.name}}" id="inputEmail" placeholder="{{ i18n_email|force_escape }}" class="form-control" required>
|
||||
<label for="inputEmail">{{ i18n_email|force_escape }}</label>
|
||||
</div>
|
||||
<div class="d-grid mt-3">
|
||||
<button class="btn btn-lg btn-primary" type="submit">{% translate "Send me instructions!" %}</button>
|
||||
|
||||
@@ -17,12 +17,12 @@
|
||||
{% translate "New Password" as i18n_new_password1 %}
|
||||
{% translate "Confirm Password" as i18n_new_password2 %}
|
||||
<div class="form-floating form-stacked-top">
|
||||
<input type="password" name="{{form.password1.name}}" id="inputPassword1" placeholder="{{ i18n_new_password1 }}" class="form-control" required>
|
||||
<label for="inputPassword1">{{ i18n_new_password1 }}</label>
|
||||
<input type="password" name="{{form.password1.name}}" id="inputPassword1" placeholder="{{ i18n_new_password1|force_escape }}" class="form-control" required>
|
||||
<label for="inputPassword1">{{ i18n_new_password1|force_escape }}</label>
|
||||
</div>
|
||||
<div class="form-floating form-stacked-bottom">
|
||||
<input type="password" name="{{form.password2.name}}" id="inputPassword2" placeholder="{{ i18n_new_password2 }}" class="form-control" required>
|
||||
<label for="inputPassword2">{{ i18n_new_password2 }}</label>
|
||||
<input type="password" name="{{form.password2.name}}" id="inputPassword2" placeholder="{{ i18n_new_password2|force_escape }}" class="form-control" required>
|
||||
<label for="inputPassword2">{{ i18n_new_password2|force_escape }}</label>
|
||||
</div>
|
||||
<div class="d-grid mt-3">
|
||||
<button class="btn btn-lg btn-primary" type="submit">{% translate "Change my password" %}</button>
|
||||
|
||||
@@ -11,5 +11,5 @@
|
||||
|
||||
{% block form_content %}
|
||||
{% url 'account_login' as login_url %}
|
||||
<p>{% blocktranslate %}Your new password has been set. You can now <a href="{{ login_url }}">log in</a>{% endblocktranslate %}.</p>
|
||||
<p>{% translate "Your new password has been set. You can now" %} <a href="{{ login_url }}">{% translate "log in" %}</a>.</p>
|
||||
{% endblock form_content %}
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
{% block form_top_content %}
|
||||
{% if not FIRST_INSTALL %}
|
||||
<p>
|
||||
{% blocktrans %}Already have an account? <a href="{{ login_url }}">Sign in</a>{% endblocktrans %}
|
||||
{% translate "Already have an account?" %} <a href="{{ login_url }}">{% translate "Sign in" %}</a>
|
||||
</p>
|
||||
{% endif %}
|
||||
{% endblock form_top_content %}
|
||||
@@ -16,7 +16,7 @@
|
||||
{% block form_content %}
|
||||
{% if FIRST_INSTALL %}
|
||||
<p>
|
||||
{% blocktrans %}Note: This is the first user account for this installation and will be granted superuser privileges.{% endblocktrans %}
|
||||
{% translate "Note: This is the first user account for this installation and will be granted superuser privileges." %}
|
||||
</p>
|
||||
{% endif %}
|
||||
{% translate "Username" as i18n_username %}
|
||||
@@ -24,20 +24,20 @@
|
||||
{% translate "Password" as i18n_password1 %}
|
||||
{% translate "Password (again)" as i18n_password2 %}
|
||||
<div class="form-floating form-stacked-top">
|
||||
<input type="text" name="username" id="inputUsername" placeholder="{{ i18n_username }}" class="form-control" autocorrect="off" autocapitalize="none" required autofocus>
|
||||
<label for="inputUsername">{{ i18n_username }}</label>
|
||||
<input type="text" name="username" id="inputUsername" placeholder="{{ i18n_username|force_escape }}" class="form-control" autocorrect="off" autocapitalize="none" required autofocus>
|
||||
<label for="inputUsername">{{ i18n_username|force_escape }}</label>
|
||||
</div>
|
||||
<div class="form-floating form-stacked-middle">
|
||||
<input type="email" name="email" id="inputEmail" placeholder="{{ i18n_email }}" class="form-control">
|
||||
<label for="inputEmail">{{ i18n_email }}</label>
|
||||
<input type="email" name="email" id="inputEmail" placeholder="{{ i18n_email|force_escape }}" class="form-control">
|
||||
<label for="inputEmail">{{ i18n_email|force_escape }}</label>
|
||||
</div>
|
||||
<div class="form-floating form-stacked-middle">
|
||||
<input type="password" name="password1" id="inputPassword1" placeholder="{{ i18n_password1 }}" class="form-control" required>
|
||||
<label for="inputPassword1">{{ i18n_password1 }}</label>
|
||||
<input type="password" name="password1" id="inputPassword1" placeholder="{{ i18n_password1|force_escape }}" class="form-control" required>
|
||||
<label for="inputPassword1">{{ i18n_password1|force_escape }}</label>
|
||||
</div>
|
||||
<div class="form-floating form-stacked-bottom">
|
||||
<input type="password" name="password2" id="inputPassword2" placeholder="{{ i18n_password2 }}" class="form-control" required>
|
||||
<label for="inputPassword2">{{ i18n_password2 }}</label>
|
||||
<input type="password" name="password2" id="inputPassword2" placeholder="{{ i18n_password2|force_escape }}" class="form-control" required>
|
||||
<label for="inputPassword2">{{ i18n_password2|force_escape }}</label>
|
||||
</div>
|
||||
<div class="d-grid mt-3">
|
||||
<button class="btn btn-lg btn-primary" type="submit">{% translate "Sign up" %}</button>
|
||||
|
||||
@@ -9,15 +9,15 @@
|
||||
|
||||
{% block form_top_content %}
|
||||
<p>
|
||||
{% blocktranslate %}Your account is protected by two-factor authentication. Please enter an authenticator code:{% endblocktranslate %}
|
||||
{% translate "Your account is protected by two-factor authentication. Please enter an authenticator code:" %}
|
||||
</p>
|
||||
{% endblock form_top_content %}
|
||||
|
||||
{% block form_content %}
|
||||
{% translate "Code" as i18n_code %}
|
||||
<div class="form-floating">
|
||||
<input type="code" name="code" id="inputCode" autocomplete="one-time-code" placeholder="{{ i18n_code }}" class="form-control" required autofocus>
|
||||
<label for="inputCode">{{ i18n_code }}</label>
|
||||
<input type="code" name="code" id="inputCode" autocomplete="one-time-code" placeholder="{{ i18n_code|force_escape }}" class="form-control" required autofocus>
|
||||
<label for="inputCode">{{ i18n_code|force_escape }}</label>
|
||||
</div>
|
||||
<div class="d-grid mt-3">
|
||||
<button class="btn btn-lg btn-primary" type="submit">{% translate "Sign in" %}</button>
|
||||
|
||||
@@ -7,5 +7,5 @@
|
||||
|
||||
{% block form_content %}
|
||||
{% url 'account_login' as login_url %}
|
||||
<p>{% blocktranslate %}An error occurred while attempting to login via your social network account. Back to the <a href="{{ login_url }}">login page</a>{% endblocktranslate %}</p>
|
||||
<p>{% translate "An error occurred while attempting to login via your social network account. Back to the" %} <a href="{{ login_url }}">{% translate "login page" %}</a></p>
|
||||
{% endblock form_content %}
|
||||
|
||||
@@ -7,7 +7,9 @@
|
||||
|
||||
{% block form_content %}
|
||||
<p>
|
||||
{% blocktrans with provider.name as provider %}You are about to connect a new third-party account from {{ provider }}.{% endblocktrans %}
|
||||
{% filter force_escape %}
|
||||
{% blocktrans with provider=provider.name %}You are about to connect a new third-party account from {{ provider }}.{% endblocktrans %}
|
||||
{% endfilter %}
|
||||
</p>
|
||||
<div class="d-grid mt-3">
|
||||
<button class="btn btn-lg btn-primary" type="submit">{% translate "Continue" %}</button>
|
||||
|
||||
@@ -7,18 +7,20 @@
|
||||
|
||||
{% block form_content %}
|
||||
<p>
|
||||
{% blocktrans with provider_name=account.get_provider.name %}You are about to use your {{provider_name}} account to login.{% endblocktrans %}
|
||||
{% blocktrans %}As a final step, please complete the following form:{% endblocktrans %}
|
||||
{% filter force_escape %}
|
||||
{% blocktrans with provider_name=account.get_provider.name %}You are about to use your {{ provider_name }} account to login.{% endblocktrans %}
|
||||
{% endfilter %}
|
||||
{% translate "As a final step, please complete the following form:" %}
|
||||
</p>
|
||||
{% translate "Username" as i18n_username %}
|
||||
{% translate "Email (optional)" as i18n_email %}
|
||||
<div class="form-floating form-stacked-top">
|
||||
<input type="text" name="{{ form.username.name }}" id="inputUsername" placeholder="{{ i18n_username }}" class="form-control" autocorrect="off" autocapitalize="none" required autofocus value="{{ form.username.value }}">
|
||||
<label for="inputUsername">{{ i18n_username }}</label>
|
||||
<input type="text" name="{{ form.username.name }}" id="inputUsername" placeholder="{{ i18n_username|force_escape }}" class="form-control" autocorrect="off" autocapitalize="none" required autofocus value="{{ form.username.value }}">
|
||||
<label for="inputUsername">{{ i18n_username|force_escape }}</label>
|
||||
</div>
|
||||
<div class="form-floating form-stacked-bottom">
|
||||
<input type="email" name="{{ form.email.name }}" id="inputEmail" placeholder="{{ i18n_email }}" class="form-control" autocorrect="off" autocapitalize="none" autofocus value="{{ form.email.value }}">
|
||||
<label for="inputEmail">{{ i18n_email }}</label>
|
||||
<input type="email" name="{{ form.email.name }}" id="inputEmail" placeholder="{{ i18n_email|force_escape }}" class="form-control" autocorrect="off" autocapitalize="none" autofocus value="{{ form.email.value }}">
|
||||
<label for="inputEmail">{{ i18n_email|force_escape }}</label>
|
||||
</div>
|
||||
{% if redirect_field_value %}
|
||||
<input type="hidden" name="{{ redirect_field_name }}" value="{{ redirect_field_value }}" />
|
||||
|
||||
@@ -82,8 +82,8 @@ def sample_doc(
|
||||
|
||||
return DocumentFactory(
|
||||
title="test",
|
||||
checksum="42995833e01aea9b3edee44bbfdd7ce1",
|
||||
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
|
||||
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
content="test content",
|
||||
pk=1,
|
||||
filename="0000001.pdf",
|
||||
|
||||
@@ -60,7 +60,7 @@ class DocumentFactory(DjangoModelFactory):
|
||||
model = Document
|
||||
|
||||
title = factory.Faker("sentence", nb_words=4)
|
||||
checksum = factory.Faker("md5")
|
||||
checksum = factory.Faker("sha256")
|
||||
content = factory.Faker("paragraph")
|
||||
correspondent = None
|
||||
document_type = None
|
||||
|
||||
@@ -46,7 +46,7 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
|
||||
"pages": None,
|
||||
"language": None,
|
||||
"mode": None,
|
||||
"skip_archive_file": None,
|
||||
"archive_file_generation": None,
|
||||
"image_dpi": None,
|
||||
"unpaper_clean": None,
|
||||
"deskew": None,
|
||||
|
||||
@@ -1144,56 +1144,6 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
||||
self.assertEqual(len(response.data["all"]), 50)
|
||||
self.assertCountEqual(response.data["all"], [d.id for d in docs])
|
||||
|
||||
def test_list_with_include_selection_data(self) -> None:
|
||||
correspondent = Correspondent.objects.create(name="c1")
|
||||
doc_type = DocumentType.objects.create(name="dt1")
|
||||
storage_path = StoragePath.objects.create(name="sp1")
|
||||
tag = Tag.objects.create(name="tag")
|
||||
|
||||
matching_doc = Document.objects.create(
|
||||
checksum="A",
|
||||
correspondent=correspondent,
|
||||
document_type=doc_type,
|
||||
storage_path=storage_path,
|
||||
)
|
||||
matching_doc.tags.add(tag)
|
||||
|
||||
non_matching_doc = Document.objects.create(checksum="B")
|
||||
non_matching_doc.tags.add(Tag.objects.create(name="other"))
|
||||
|
||||
response = self.client.get(
|
||||
f"/api/documents/?tags__id__in={tag.id}&include_selection_data=true",
|
||||
)
|
||||
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertIn("selection_data", response.data)
|
||||
|
||||
selected_correspondent = next(
|
||||
item
|
||||
for item in response.data["selection_data"]["selected_correspondents"]
|
||||
if item["id"] == correspondent.id
|
||||
)
|
||||
selected_tag = next(
|
||||
item
|
||||
for item in response.data["selection_data"]["selected_tags"]
|
||||
if item["id"] == tag.id
|
||||
)
|
||||
selected_type = next(
|
||||
item
|
||||
for item in response.data["selection_data"]["selected_document_types"]
|
||||
if item["id"] == doc_type.id
|
||||
)
|
||||
selected_storage_path = next(
|
||||
item
|
||||
for item in response.data["selection_data"]["selected_storage_paths"]
|
||||
if item["id"] == storage_path.id
|
||||
)
|
||||
|
||||
self.assertEqual(selected_correspondent["document_count"], 1)
|
||||
self.assertEqual(selected_tag["document_count"], 1)
|
||||
self.assertEqual(selected_type["document_count"], 1)
|
||||
self.assertEqual(selected_storage_path["document_count"], 1)
|
||||
|
||||
def test_statistics(self) -> None:
|
||||
doc1 = Document.objects.create(
|
||||
title="none1",
|
||||
|
||||
@@ -89,46 +89,6 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(len(results), 0)
|
||||
self.assertCountEqual(response.data["all"], [])
|
||||
|
||||
def test_search_with_include_selection_data(self) -> None:
|
||||
correspondent = Correspondent.objects.create(name="c1")
|
||||
doc_type = DocumentType.objects.create(name="dt1")
|
||||
storage_path = StoragePath.objects.create(name="sp1")
|
||||
tag = Tag.objects.create(name="tag")
|
||||
|
||||
matching_doc = Document.objects.create(
|
||||
title="bank statement",
|
||||
content="bank content",
|
||||
checksum="A",
|
||||
correspondent=correspondent,
|
||||
document_type=doc_type,
|
||||
storage_path=storage_path,
|
||||
)
|
||||
matching_doc.tags.add(tag)
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
index.update_document(writer, matching_doc)
|
||||
|
||||
response = self.client.get(
|
||||
"/api/documents/?query=bank&include_selection_data=true",
|
||||
)
|
||||
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertIn("selection_data", response.data)
|
||||
|
||||
selected_correspondent = next(
|
||||
item
|
||||
for item in response.data["selection_data"]["selected_correspondents"]
|
||||
if item["id"] == correspondent.id
|
||||
)
|
||||
selected_tag = next(
|
||||
item
|
||||
for item in response.data["selection_data"]["selected_tags"]
|
||||
if item["id"] == tag.id
|
||||
)
|
||||
|
||||
self.assertEqual(selected_correspondent["document_count"], 1)
|
||||
self.assertEqual(selected_tag["document_count"], 1)
|
||||
|
||||
def test_search_custom_field_ordering(self) -> None:
|
||||
custom_field = CustomField.objects.create(
|
||||
name="Sortable field",
|
||||
|
||||
@@ -1020,7 +1020,7 @@ class TestTagBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, Tes
|
||||
CONSUMER_TAG_BARCODE_SPLIT=True,
|
||||
CONSUMER_TAG_BARCODE_MAPPING={"TAG:(.*)": "\\g<1>"},
|
||||
CELERY_TASK_ALWAYS_EAGER=True,
|
||||
OCR_MODE="skip",
|
||||
OCR_MODE="auto",
|
||||
)
|
||||
def test_consume_barcode_file_tag_split_and_assignment(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -13,8 +13,10 @@ class TestDocumentChecks(TestCase):
|
||||
def test_parser_check(self) -> None:
|
||||
self.assertEqual(parser_check(None), [])
|
||||
|
||||
with mock.patch("documents.checks.document_consumer_declaration.send") as m:
|
||||
m.return_value = []
|
||||
with mock.patch("documents.checks.get_parser_registry") as mock_registry_fn:
|
||||
mock_registry = mock.MagicMock()
|
||||
mock_registry.all_parsers.return_value = []
|
||||
mock_registry_fn.return_value = mock_registry
|
||||
|
||||
self.assertEqual(
|
||||
parser_check(None),
|
||||
|
||||
@@ -27,7 +27,6 @@ from documents.models import Document
|
||||
from documents.models import DocumentType
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.plugins.helpers import ProgressStatusOptions
|
||||
from documents.tasks import sanity_check
|
||||
@@ -38,62 +37,106 @@ from documents.tests.utils import GetConsumerMixin
|
||||
from paperless_mail.models import MailRule
|
||||
|
||||
|
||||
class _BaseTestParser(DocumentParser):
|
||||
def get_settings(self) -> None:
|
||||
class _BaseNewStyleParser:
|
||||
"""Minimal ParserProtocol implementation for use in consumer tests."""
|
||||
|
||||
name: str = "test-parser"
|
||||
version: str = "0.1"
|
||||
author: str = "test"
|
||||
url: str = "test"
|
||||
|
||||
@classmethod
|
||||
def supported_mime_types(cls) -> dict:
|
||||
return {
|
||||
"application/pdf": ".pdf",
|
||||
"image/png": ".png",
|
||||
"message/rfc822": ".eml",
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def score(cls, mime_type: str, filename: str, path=None):
|
||||
return 0 if mime_type in cls.supported_mime_types() else None
|
||||
|
||||
@property
|
||||
def can_produce_archive(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
|
||||
def requires_pdf_rendition(self) -> bool:
|
||||
return False
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._tmpdir: Path | None = None
|
||||
self._text: str | None = None
|
||||
self._archive: Path | None = None
|
||||
self._thumb: Path | None = None
|
||||
|
||||
def __enter__(self):
|
||||
self._tmpdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-test-", dir=settings.SCRATCH_DIR),
|
||||
)
|
||||
_, thumb = tempfile.mkstemp(suffix=".webp", dir=self._tmpdir)
|
||||
self._thumb = Path(thumb)
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
|
||||
if self._tmpdir and self._tmpdir.exists():
|
||||
shutil.rmtree(self._tmpdir, ignore_errors=True)
|
||||
|
||||
def configure(self, context) -> None:
|
||||
"""
|
||||
This parser does not implement additional settings yet
|
||||
Test parser doesn't do anything with context
|
||||
"""
|
||||
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
return self._text
|
||||
|
||||
def get_date(self):
|
||||
return None
|
||||
|
||||
def get_archive_path(self):
|
||||
return self._archive
|
||||
|
||||
class DummyParser(_BaseTestParser):
|
||||
def __init__(self, logging_group, scratch_dir, archive_path) -> None:
|
||||
super().__init__(logging_group, None)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||
self.archive_path = archive_path
|
||||
def get_thumbnail(self, document_path, mime_type) -> Path:
|
||||
return self._thumb
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
def get_page_count(self, document_path, mime_type):
|
||||
return None
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None) -> None:
|
||||
self.text = "The Text"
|
||||
def extract_metadata(self, document_path, mime_type) -> list:
|
||||
return []
|
||||
|
||||
|
||||
class CopyParser(_BaseTestParser):
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
class DummyParser(_BaseNewStyleParser):
|
||||
_ARCHIVE_SRC = (
|
||||
Path(__file__).parent / "samples" / "documents" / "archive" / "0000001.pdf"
|
||||
)
|
||||
|
||||
def __init__(self, logging_group, progress_callback=None) -> None:
|
||||
super().__init__(logging_group, progress_callback)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=self.tempdir)
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None) -> None:
|
||||
self.text = "The text"
|
||||
self.archive_path = Path(self.tempdir / "archive.pdf")
|
||||
shutil.copy(document_path, self.archive_path)
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
self._text = "The Text"
|
||||
if produce_archive and self._tmpdir:
|
||||
self._archive = self._tmpdir / "archive.pdf"
|
||||
shutil.copy(self._ARCHIVE_SRC, self._archive)
|
||||
|
||||
|
||||
class FaultyParser(_BaseTestParser):
|
||||
def __init__(self, logging_group, scratch_dir) -> None:
|
||||
super().__init__(logging_group)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||
class CopyParser(_BaseNewStyleParser):
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
self._text = "The text"
|
||||
if produce_archive and self._tmpdir:
|
||||
self._archive = self._tmpdir / "archive.pdf"
|
||||
shutil.copy(document_path, self._archive)
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
class FaultyParser(_BaseNewStyleParser):
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
raise ParseError("Does not compute.")
|
||||
|
||||
|
||||
class FaultyGenericExceptionParser(_BaseTestParser):
|
||||
def __init__(self, logging_group, scratch_dir) -> None:
|
||||
super().__init__(logging_group)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
class FaultyGenericExceptionParser(_BaseNewStyleParser):
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
raise Exception("Generic exception.")
|
||||
|
||||
|
||||
@@ -147,38 +190,12 @@ class TestConsumer(
|
||||
self.assertEqual(payload["data"]["max_progress"], last_progress_max)
|
||||
self.assertEqual(payload["data"]["status"], last_status)
|
||||
|
||||
def make_dummy_parser(self, logging_group, progress_callback=None):
|
||||
return DummyParser(
|
||||
logging_group,
|
||||
self.dirs.scratch_dir,
|
||||
self.get_test_archive_file(),
|
||||
)
|
||||
|
||||
def make_faulty_parser(self, logging_group, progress_callback=None):
|
||||
return FaultyParser(logging_group, self.dirs.scratch_dir)
|
||||
|
||||
def make_faulty_generic_exception_parser(
|
||||
self,
|
||||
logging_group,
|
||||
progress_callback=None,
|
||||
):
|
||||
return FaultyGenericExceptionParser(logging_group, self.dirs.scratch_dir)
|
||||
|
||||
def setUp(self) -> None:
|
||||
super().setUp()
|
||||
|
||||
patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
m = patcher.start()
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": self.make_dummy_parser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
patcher = mock.patch("documents.consumer.get_parser_registry")
|
||||
mock_registry = patcher.start()
|
||||
mock_registry.return_value.get_parser_for_file.return_value = DummyParser
|
||||
self.addCleanup(patcher.stop)
|
||||
|
||||
def get_test_file(self):
|
||||
@@ -213,7 +230,11 @@ class TestConsumer(
|
||||
shutil.copy(src, dst)
|
||||
return dst
|
||||
|
||||
@override_settings(FILENAME_FORMAT=None, TIME_ZONE="America/Chicago")
|
||||
@override_settings(
|
||||
FILENAME_FORMAT=None,
|
||||
TIME_ZONE="America/Chicago",
|
||||
ARCHIVE_FILE_GENERATION="always",
|
||||
)
|
||||
def testNormalOperation(self) -> None:
|
||||
filename = self.get_test_file()
|
||||
|
||||
@@ -244,8 +265,14 @@ class TestConsumer(
|
||||
|
||||
self.assertIsFile(document.archive_path)
|
||||
|
||||
self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
|
||||
self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
|
||||
self.assertEqual(
|
||||
document.checksum,
|
||||
"1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
)
|
||||
self.assertEqual(
|
||||
document.archive_checksum,
|
||||
"706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
)
|
||||
|
||||
self.assertIsNotFile(filename)
|
||||
|
||||
@@ -547,9 +574,9 @@ class TestConsumer(
|
||||
) as consumer:
|
||||
consumer.run()
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def testNoParsers(self, m) -> None:
|
||||
m.return_value = []
|
||||
m.return_value.get_parser_for_file.return_value = None
|
||||
|
||||
with self.assertRaisesMessage(
|
||||
ConsumerError,
|
||||
@@ -560,18 +587,9 @@ class TestConsumer(
|
||||
|
||||
self._assert_first_last_send_progress(last_status="FAILED")
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def testFaultyParser(self, m) -> None:
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": self.make_faulty_parser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
m.return_value.get_parser_for_file.return_value = FaultyParser
|
||||
|
||||
with self.get_consumer(self.get_test_file()) as consumer:
|
||||
with self.assertRaisesMessage(
|
||||
@@ -582,18 +600,9 @@ class TestConsumer(
|
||||
|
||||
self._assert_first_last_send_progress(last_status="FAILED")
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def testGenericParserException(self, m) -> None:
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": self.make_faulty_generic_exception_parser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
m.return_value.get_parser_for_file.return_value = FaultyGenericExceptionParser
|
||||
|
||||
with self.get_consumer(self.get_test_file()) as consumer:
|
||||
with self.assertRaisesMessage(
|
||||
@@ -624,7 +633,10 @@ class TestConsumer(
|
||||
# Database empty
|
||||
self.assertEqual(Document.objects.all().count(), 0)
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
|
||||
@override_settings(
|
||||
FILENAME_FORMAT="{correspondent}/{title}",
|
||||
ARCHIVE_FILE_GENERATION="always",
|
||||
)
|
||||
def testFilenameHandling(self) -> None:
|
||||
with self.get_consumer(
|
||||
self.get_test_file(),
|
||||
@@ -641,7 +653,7 @@ class TestConsumer(
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@mock.patch("documents.consumer.generate_unique_filename")
|
||||
@override_settings(FILENAME_FORMAT="{pk}")
|
||||
@override_settings(FILENAME_FORMAT="{pk}", ARCHIVE_FILE_GENERATION="always")
|
||||
def testFilenameHandlingFallsBackWhenGeneratedPathExceedsDbLimit(self, m):
|
||||
m.side_effect = lambda doc, archive_filename=False: Path(
|
||||
("a" * 1100 + ".pdf") if not archive_filename else ("b" * 1100 + ".pdf"),
|
||||
@@ -668,7 +680,10 @@ class TestConsumer(
|
||||
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
|
||||
@override_settings(
|
||||
FILENAME_FORMAT="{correspondent}/{title}",
|
||||
ARCHIVE_FILE_GENERATION="always",
|
||||
)
|
||||
@mock.patch("documents.signals.handlers.generate_unique_filename")
|
||||
def testFilenameHandlingUnstableFormat(self, m) -> None:
|
||||
filenames = ["this", "that", "now this", "i cannot decide"]
|
||||
@@ -1016,8 +1031,8 @@ class TestConsumer(
|
||||
self.assertEqual(Document.objects.count(), 2)
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{title}")
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@override_settings(FILENAME_FORMAT="{title}", ARCHIVE_FILE_GENERATION="always")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def test_similar_filenames(self, m) -> None:
|
||||
shutil.copy(
|
||||
Path(__file__).parent / "samples" / "simple.pdf",
|
||||
@@ -1031,16 +1046,7 @@ class TestConsumer(
|
||||
Path(__file__).parent / "samples" / "simple-noalpha.png",
|
||||
settings.CONSUMPTION_DIR / "simple.png.pdf",
|
||||
)
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": CopyParser,
|
||||
"mime_types": {"application/pdf": ".pdf", "image/png": ".png"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
m.return_value.get_parser_for_file.return_value = CopyParser
|
||||
|
||||
with self.get_consumer(settings.CONSUMPTION_DIR / "simple.png") as consumer:
|
||||
consumer.run()
|
||||
@@ -1068,8 +1074,10 @@ class TestConsumer(
|
||||
|
||||
sanity_check()
|
||||
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
@mock.patch("documents.consumer.run_subprocess")
|
||||
def test_try_to_clean_invalid_pdf(self, m) -> None:
|
||||
def test_try_to_clean_invalid_pdf(self, m, mock_registry) -> None:
|
||||
mock_registry.return_value.get_parser_for_file.return_value = None
|
||||
shutil.copy(
|
||||
Path(__file__).parent / "samples" / "invalid_pdf.pdf",
|
||||
settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
|
||||
@@ -1091,10 +1099,10 @@ class TestConsumer(
|
||||
|
||||
@mock.patch("paperless_mail.models.MailRule.objects.get")
|
||||
@mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def test_mail_parser_receives_mailrule(
|
||||
self,
|
||||
mock_consumer_declaration_send: mock.Mock,
|
||||
mock_get_parser_registry: mock.Mock,
|
||||
mock_mail_parser_parse: mock.Mock,
|
||||
mock_mailrule_get: mock.Mock,
|
||||
) -> None:
|
||||
@@ -1106,18 +1114,11 @@ class TestConsumer(
|
||||
THEN:
|
||||
- The mail parser should receive the mail rule
|
||||
"""
|
||||
from paperless_mail.signals import get_parser as mail_get_parser
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
|
||||
mock_consumer_declaration_send.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": mail_get_parser,
|
||||
"mime_types": {"message/rfc822": ".eml"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
mock_get_parser_registry.return_value.get_parser_for_file.return_value = (
|
||||
MailDocumentParser
|
||||
)
|
||||
mock_mailrule_get.return_value = mock.Mock(
|
||||
pdf_layout=MailRule.PdfLayout.HTML_ONLY,
|
||||
)
|
||||
@@ -1141,6 +1142,7 @@ class TestConsumer(
|
||||
mock_mail_parser_parse.assert_called_once_with(
|
||||
consumer.working_copy,
|
||||
"message/rfc822",
|
||||
produce_archive=True,
|
||||
)
|
||||
|
||||
|
||||
@@ -1288,7 +1290,14 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
|
||||
def test_no_pre_consume_script(self, m) -> None:
|
||||
with self.get_consumer(self.test_file) as c:
|
||||
c.run()
|
||||
m.assert_not_called()
|
||||
# Verify no pre-consume script subprocess was invoked
|
||||
# (run_subprocess may still be called by _extract_text_for_archive_check)
|
||||
script_calls = [
|
||||
call
|
||||
for call in m.call_args_list
|
||||
if call.args and call.args[0] and call.args[0][0] not in ("pdftotext",)
|
||||
]
|
||||
self.assertEqual(script_calls, [])
|
||||
|
||||
@mock.patch("documents.consumer.run_subprocess")
|
||||
@override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
|
||||
@@ -1304,9 +1313,16 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
|
||||
with self.get_consumer(self.test_file) as c:
|
||||
c.run()
|
||||
|
||||
m.assert_called_once()
|
||||
self.assertTrue(m.called)
|
||||
|
||||
args, _ = m.call_args
|
||||
# Find the call that invoked the pre-consume script
|
||||
# (run_subprocess may also be called by _extract_text_for_archive_check)
|
||||
script_call = next(
|
||||
call
|
||||
for call in m.call_args_list
|
||||
if call.args and call.args[0] and call.args[0][0] == script.name
|
||||
)
|
||||
args, _ = script_call
|
||||
|
||||
command = args[0]
|
||||
environment = args[1]
|
||||
|
||||
189
src/documents/tests/test_consumer_archive.py
Normal file
189
src/documents/tests/test_consumer_archive.py
Normal file
@@ -0,0 +1,189 @@
|
||||
"""Tests for should_produce_archive()."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from documents.consumer import should_produce_archive
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
|
||||
def _parser_instance(
|
||||
*,
|
||||
can_produce: bool = True,
|
||||
requires_rendition: bool = False,
|
||||
) -> MagicMock:
|
||||
"""Return a mock parser instance with the given capability flags."""
|
||||
instance = MagicMock()
|
||||
instance.can_produce_archive = can_produce
|
||||
instance.requires_pdf_rendition = requires_rendition
|
||||
return instance
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def null_app_config(mocker) -> MagicMock:
|
||||
"""Mock ApplicationConfiguration with all fields None → falls back to Django settings."""
|
||||
return mocker.MagicMock(
|
||||
output_type=None,
|
||||
pages=None,
|
||||
language=None,
|
||||
mode=None,
|
||||
archive_file_generation=None,
|
||||
image_dpi=None,
|
||||
unpaper_clean=None,
|
||||
deskew=None,
|
||||
rotate_pages=None,
|
||||
rotate_pages_threshold=None,
|
||||
max_image_pixels=None,
|
||||
color_conversion_strategy=None,
|
||||
user_args=None,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def patch_app_config(mocker, null_app_config):
|
||||
"""Patch BaseConfig._get_config_instance for all tests in this module."""
|
||||
mocker.patch(
|
||||
"paperless.config.BaseConfig._get_config_instance",
|
||||
return_value=null_app_config,
|
||||
)
|
||||
|
||||
|
||||
class TestShouldProduceArchive:
|
||||
@pytest.mark.parametrize(
|
||||
("generation", "can_produce", "requires_rendition", "mime", "expected"),
|
||||
[
|
||||
pytest.param(
|
||||
"never",
|
||||
True,
|
||||
False,
|
||||
"application/pdf",
|
||||
False,
|
||||
id="never-returns-false",
|
||||
),
|
||||
pytest.param(
|
||||
"always",
|
||||
True,
|
||||
False,
|
||||
"application/pdf",
|
||||
True,
|
||||
id="always-returns-true",
|
||||
),
|
||||
pytest.param(
|
||||
"never",
|
||||
True,
|
||||
True,
|
||||
"application/pdf",
|
||||
True,
|
||||
id="requires-rendition-overrides-never",
|
||||
),
|
||||
pytest.param(
|
||||
"always",
|
||||
False,
|
||||
False,
|
||||
"text/plain",
|
||||
False,
|
||||
id="cannot-produce-overrides-always",
|
||||
),
|
||||
pytest.param(
|
||||
"always",
|
||||
False,
|
||||
True,
|
||||
"application/pdf",
|
||||
True,
|
||||
id="requires-rendition-wins-even-if-cannot-produce",
|
||||
),
|
||||
pytest.param(
|
||||
"auto",
|
||||
True,
|
||||
False,
|
||||
"image/tiff",
|
||||
True,
|
||||
id="auto-image-returns-true",
|
||||
),
|
||||
pytest.param(
|
||||
"auto",
|
||||
True,
|
||||
False,
|
||||
"message/rfc822",
|
||||
False,
|
||||
id="auto-non-pdf-non-image-returns-false",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_generation_setting(
|
||||
self,
|
||||
settings,
|
||||
generation: str,
|
||||
can_produce: bool, # noqa: FBT001
|
||||
requires_rendition: bool, # noqa: FBT001
|
||||
mime: str,
|
||||
expected: bool, # noqa: FBT001
|
||||
) -> None:
|
||||
settings.ARCHIVE_FILE_GENERATION = generation
|
||||
parser = _parser_instance(
|
||||
can_produce=can_produce,
|
||||
requires_rendition=requires_rendition,
|
||||
)
|
||||
assert should_produce_archive(parser, mime, Path("/tmp/doc")) is expected
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("extracted_text", "expected"),
|
||||
[
|
||||
pytest.param(
|
||||
"This is a born-digital PDF with lots of text content. " * 10,
|
||||
False,
|
||||
id="born-digital-long-text-skips-archive",
|
||||
),
|
||||
pytest.param(None, True, id="no-text-scanned-produces-archive"),
|
||||
pytest.param("tiny", True, id="short-text-treated-as-scanned"),
|
||||
],
|
||||
)
|
||||
def test_auto_pdf_archive_decision(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
settings,
|
||||
extracted_text: str | None,
|
||||
expected: bool, # noqa: FBT001
|
||||
) -> None:
|
||||
settings.ARCHIVE_FILE_GENERATION = "auto"
|
||||
mocker.patch("documents.consumer.is_tagged_pdf", return_value=False)
|
||||
mocker.patch("documents.consumer.extract_pdf_text", return_value=extracted_text)
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
assert (
|
||||
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
|
||||
is expected
|
||||
)
|
||||
|
||||
def test_tagged_pdf_skips_archive_in_auto_mode(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
settings,
|
||||
) -> None:
|
||||
"""Tagged PDFs (e.g. Word exports) are treated as born-digital regardless of text length."""
|
||||
settings.ARCHIVE_FILE_GENERATION = "auto"
|
||||
mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
assert (
|
||||
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
|
||||
is False
|
||||
)
|
||||
|
||||
def test_tagged_pdf_does_not_call_pdftotext(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
settings,
|
||||
) -> None:
|
||||
"""When a PDF is tagged, pdftotext is not invoked (fast path)."""
|
||||
settings.ARCHIVE_FILE_GENERATION = "auto"
|
||||
mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
|
||||
mock_extract = mocker.patch("documents.consumer.extract_pdf_text")
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
|
||||
mock_extract.assert_not_called()
|
||||
@@ -27,7 +27,10 @@ sample_file: Path = Path(__file__).parent / "samples" / "simple.pdf"
|
||||
|
||||
|
||||
@pytest.mark.management
|
||||
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
|
||||
@override_settings(
|
||||
FILENAME_FORMAT="{correspondent}/{title}",
|
||||
ARCHIVE_FILE_GENERATION="always",
|
||||
)
|
||||
class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
def make_models(self):
|
||||
return Document.objects.create(
|
||||
|
||||
@@ -63,8 +63,8 @@ class TestExportImport(
|
||||
|
||||
self.d1 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="42995833e01aea9b3edee44bbfdd7ce1",
|
||||
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
|
||||
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
title="wow1",
|
||||
filename="0000001.pdf",
|
||||
mime_type="application/pdf",
|
||||
@@ -72,21 +72,21 @@ class TestExportImport(
|
||||
)
|
||||
self.d2 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="9c9691e51741c1f4f41a20896af31770",
|
||||
checksum="550d1bae0f746d4f7c6be07054eb20cc2f11988a58ef64ceae45e98f85e92a5b",
|
||||
title="wow2",
|
||||
filename="0000002.pdf",
|
||||
mime_type="application/pdf",
|
||||
)
|
||||
self.d3 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="d38d7ed02e988e072caf924e0f3fcb76",
|
||||
checksum="f1ba6b7ff8548214a75adec228f5468a14fe187f445bc0b9485cbf1c35b15915",
|
||||
title="wow2",
|
||||
filename="0000003.pdf",
|
||||
mime_type="application/pdf",
|
||||
)
|
||||
self.d4 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="82186aaa94f0b98697d704b90fd1c072",
|
||||
checksum="a81b16b6b313cfd7e60eb7b12598d1343b58622b4030cfa19a2724a02e98db1b",
|
||||
title="wow_dec",
|
||||
filename="0000004.pdf",
|
||||
mime_type="application/pdf",
|
||||
@@ -239,7 +239,7 @@ class TestExportImport(
|
||||
)
|
||||
|
||||
with Path(fname).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
checksum = hashlib.sha256(f.read()).hexdigest()
|
||||
self.assertEqual(checksum, element["fields"]["checksum"])
|
||||
|
||||
# Generated field "content_length" should not be exported,
|
||||
@@ -253,7 +253,7 @@ class TestExportImport(
|
||||
self.assertIsFile(fname)
|
||||
|
||||
with Path(fname).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
checksum = hashlib.sha256(f.read()).hexdigest()
|
||||
self.assertEqual(checksum, element["fields"]["archive_checksum"])
|
||||
|
||||
elif element["model"] == "documents.note":
|
||||
|
||||
@@ -277,8 +277,8 @@ class TestCommandImport(
|
||||
|
||||
Document.objects.create(
|
||||
content="Content",
|
||||
checksum="42995833e01aea9b3edee44bbfdd7ce1",
|
||||
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
|
||||
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
title="wow1",
|
||||
filename="0000001.pdf",
|
||||
mime_type="application/pdf",
|
||||
|
||||
132
src/documents/tests/test_migration_sha256_checksums.py
Normal file
132
src/documents/tests/test_migration_sha256_checksums.py
Normal file
@@ -0,0 +1,132 @@
|
||||
import hashlib
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import connection
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.tests.utils import TestMigrations
|
||||
|
||||
|
||||
def _sha256(data: bytes) -> str:
|
||||
return hashlib.sha256(data).hexdigest()
|
||||
|
||||
|
||||
class TestSha256ChecksumDataMigration(TestMigrations):
|
||||
"""recompute_checksums correctly updates document checksums from MD5 to SHA256."""
|
||||
|
||||
migrate_from = "0015_document_version_index_and_more"
|
||||
migrate_to = "0016_sha256_checksums"
|
||||
reset_sequences = True
|
||||
|
||||
ORIGINAL_CONTENT = b"original file content for sha256 migration test"
|
||||
ARCHIVE_CONTENT = b"archive file content for sha256 migration test"
|
||||
|
||||
def setUpBeforeMigration(self, apps) -> None:
|
||||
self._originals_dir = Path(tempfile.mkdtemp())
|
||||
self._archive_dir = Path(tempfile.mkdtemp())
|
||||
self._settings_override = override_settings(
|
||||
ORIGINALS_DIR=self._originals_dir,
|
||||
ARCHIVE_DIR=self._archive_dir,
|
||||
)
|
||||
self._settings_override.enable()
|
||||
Document = apps.get_model("documents", "Document")
|
||||
|
||||
# doc1: original file present, no archive
|
||||
(settings.ORIGINALS_DIR / "doc1.txt").write_bytes(self.ORIGINAL_CONTENT)
|
||||
self.doc1_id = Document.objects.create(
|
||||
title="Doc 1",
|
||||
mime_type="text/plain",
|
||||
filename="doc1.txt",
|
||||
checksum="a" * 32,
|
||||
).pk
|
||||
|
||||
# doc2: original and archive both present
|
||||
(settings.ORIGINALS_DIR / "doc2.txt").write_bytes(self.ORIGINAL_CONTENT)
|
||||
(settings.ARCHIVE_DIR / "doc2.pdf").write_bytes(self.ARCHIVE_CONTENT)
|
||||
self.doc2_id = Document.objects.create(
|
||||
title="Doc 2",
|
||||
mime_type="text/plain",
|
||||
filename="doc2.txt",
|
||||
checksum="b" * 32,
|
||||
archive_filename="doc2.pdf",
|
||||
archive_checksum="c" * 32,
|
||||
).pk
|
||||
|
||||
# doc3: original file missing — checksum must stay unchanged
|
||||
self.doc3_id = Document.objects.create(
|
||||
title="Doc 3",
|
||||
mime_type="text/plain",
|
||||
filename="missing_original.txt",
|
||||
checksum="d" * 32,
|
||||
).pk
|
||||
|
||||
# doc4: original present, archive_filename set but archive file missing
|
||||
(settings.ORIGINALS_DIR / "doc4.txt").write_bytes(self.ORIGINAL_CONTENT)
|
||||
self.doc4_id = Document.objects.create(
|
||||
title="Doc 4",
|
||||
mime_type="text/plain",
|
||||
filename="doc4.txt",
|
||||
checksum="e" * 32,
|
||||
archive_filename="missing_archive.pdf",
|
||||
archive_checksum="f" * 32,
|
||||
).pk
|
||||
|
||||
# doc5: original present, archive_filename is None — archive_checksum must stay null
|
||||
(settings.ORIGINALS_DIR / "doc5.txt").write_bytes(self.ORIGINAL_CONTENT)
|
||||
self.doc5_id = Document.objects.create(
|
||||
title="Doc 5",
|
||||
mime_type="text/plain",
|
||||
filename="doc5.txt",
|
||||
checksum="0" * 32,
|
||||
archive_filename=None,
|
||||
archive_checksum=None,
|
||||
).pk
|
||||
|
||||
def _fixture_teardown(self) -> None:
|
||||
super()._fixture_teardown()
|
||||
# Django's SQLite backend returns [] from sequence_reset_sql(), so
|
||||
# reset_sequences=True flushes rows but never clears sqlite_sequence.
|
||||
# Explicitly delete the entry so subsequent tests start from pk=1.
|
||||
if connection.vendor == "sqlite":
|
||||
with connection.cursor() as cursor:
|
||||
cursor.execute(
|
||||
"DELETE FROM sqlite_sequence WHERE name='documents_document'",
|
||||
)
|
||||
|
||||
def tearDown(self) -> None:
|
||||
super().tearDown()
|
||||
self._settings_override.disable()
|
||||
shutil.rmtree(self._originals_dir, ignore_errors=True)
|
||||
shutil.rmtree(self._archive_dir, ignore_errors=True)
|
||||
|
||||
def test_original_checksum_updated_to_sha256_when_file_exists(self) -> None:
|
||||
Document = self.apps.get_model("documents", "Document")
|
||||
doc = Document.objects.get(pk=self.doc1_id)
|
||||
self.assertEqual(doc.checksum, _sha256(self.ORIGINAL_CONTENT))
|
||||
|
||||
def test_both_checksums_updated_when_original_and_archive_exist(self) -> None:
|
||||
Document = self.apps.get_model("documents", "Document")
|
||||
doc = Document.objects.get(pk=self.doc2_id)
|
||||
self.assertEqual(doc.checksum, _sha256(self.ORIGINAL_CONTENT))
|
||||
self.assertEqual(doc.archive_checksum, _sha256(self.ARCHIVE_CONTENT))
|
||||
|
||||
def test_checksum_unchanged_when_original_file_missing(self) -> None:
|
||||
Document = self.apps.get_model("documents", "Document")
|
||||
doc = Document.objects.get(pk=self.doc3_id)
|
||||
self.assertEqual(doc.checksum, "d" * 32)
|
||||
|
||||
def test_archive_checksum_unchanged_when_archive_file_missing(self) -> None:
|
||||
Document = self.apps.get_model("documents", "Document")
|
||||
doc = Document.objects.get(pk=self.doc4_id)
|
||||
# Original was updated (file exists)
|
||||
self.assertEqual(doc.checksum, _sha256(self.ORIGINAL_CONTENT))
|
||||
# Archive was not updated (file missing)
|
||||
self.assertEqual(doc.archive_checksum, "f" * 32)
|
||||
|
||||
def test_archive_checksum_stays_null_when_no_archive_filename(self) -> None:
|
||||
Document = self.apps.get_model("documents", "Document")
|
||||
doc = Document.objects.get(pk=self.doc5_id)
|
||||
self.assertIsNone(doc.archive_checksum)
|
||||
@@ -1,132 +1,16 @@
|
||||
from tempfile import TemporaryDirectory
|
||||
from unittest import mock
|
||||
|
||||
from django.apps import apps
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.parsers import get_default_file_extension
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.parsers import get_supported_file_extensions
|
||||
from documents.parsers import is_file_ext_supported
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless.parsers.registry import reset_parser_registry
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
|
||||
class TestParserDiscovery(TestCase):
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_1_parser(self, m, *args) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Parser declared for a given mimetype
|
||||
WHEN:
|
||||
- Attempt to get parser for the mimetype
|
||||
THEN:
|
||||
- Declared parser class is returned
|
||||
"""
|
||||
|
||||
class DummyParser:
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 0,
|
||||
"parser": DummyParser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_n_parsers(self, m, *args) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Two parsers declared for a given mimetype
|
||||
- Second parser has a higher weight
|
||||
WHEN:
|
||||
- Attempt to get parser for the mimetype
|
||||
THEN:
|
||||
- Second parser class is returned
|
||||
"""
|
||||
|
||||
class DummyParser1:
|
||||
pass
|
||||
|
||||
class DummyParser2:
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 0,
|
||||
"parser": DummyParser1,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 1,
|
||||
"parser": DummyParser2,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
get_parser_class_for_mime_type("application/pdf"),
|
||||
DummyParser2,
|
||||
)
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_0_parsers(self, m, *args) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- No parsers are declared
|
||||
WHEN:
|
||||
- Attempt to get parser for the mimetype
|
||||
THEN:
|
||||
- No parser class is returned
|
||||
"""
|
||||
m.return_value = []
|
||||
with TemporaryDirectory():
|
||||
self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_no_valid_parser(self, m, *args) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- No parser declared for a given mimetype
|
||||
- Parser declared for a different mimetype
|
||||
WHEN:
|
||||
- Attempt to get parser for the given mimetype
|
||||
THEN:
|
||||
- No parser class is returned
|
||||
"""
|
||||
|
||||
class DummyParser:
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 0,
|
||||
"parser": DummyParser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
|
||||
|
||||
|
||||
class TestParserAvailability(TestCase):
|
||||
def test_tesseract_parser(self) -> None:
|
||||
"""
|
||||
@@ -151,7 +35,7 @@ class TestParserAvailability(TestCase):
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||
RasterisedDocumentParser,
|
||||
)
|
||||
|
||||
@@ -175,7 +59,7 @@ class TestParserAvailability(TestCase):
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||
TextDocumentParser,
|
||||
)
|
||||
|
||||
@@ -198,22 +82,23 @@ class TestParserAvailability(TestCase):
|
||||
),
|
||||
]
|
||||
|
||||
# Force the app ready to notice the settings override
|
||||
with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
|
||||
app = apps.get_app_config("paperless_tika")
|
||||
app.ready()
|
||||
self.addCleanup(reset_parser_registry)
|
||||
|
||||
# Reset and rebuild the registry with Tika enabled.
|
||||
with override_settings(TIKA_ENABLED=True):
|
||||
reset_parser_registry()
|
||||
supported_exts = get_supported_file_extensions()
|
||||
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
TikaDocumentParser,
|
||||
)
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||
TikaDocumentParser,
|
||||
)
|
||||
|
||||
def test_no_parser_for_mime(self) -> None:
|
||||
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
|
||||
self.assertIsNone(get_parser_registry().get_parser_for_file("text/sdgsdf", ""))
|
||||
|
||||
def test_default_extension(self) -> None:
|
||||
# Test no parser declared still returns a an extension
|
||||
|
||||
@@ -232,6 +232,7 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
self.assertEqual(Document.global_objects.count(), 0)
|
||||
|
||||
|
||||
@override_settings(ARCHIVE_FILE_GENERATION="always")
|
||||
class TestUpdateContent(DirectoriesMixin, TestCase):
|
||||
def test_update_content_maybe_archive_file(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import hashlib
|
||||
import logging
|
||||
import shutil
|
||||
from os import utime
|
||||
@@ -128,3 +129,28 @@ def get_boolean(boolstr: str) -> bool:
|
||||
Return a boolean value from a string representation.
|
||||
"""
|
||||
return bool(boolstr.lower() in ("yes", "y", "1", "t", "true"))
|
||||
|
||||
|
||||
def compute_checksum(path: Path, chunk_size: int = 65536) -> str:
|
||||
"""
|
||||
Compute the SHA-256 checksum of a file.
|
||||
|
||||
Reads the file in chunks to avoid loading the entire file into memory.
|
||||
|
||||
Args:
|
||||
path (Path): Path to the file to hash.
|
||||
chunk_size (int, optional): Number of bytes to read per chunk.
|
||||
Defaults to 65536.
|
||||
|
||||
Returns:
|
||||
str: Hexadecimal SHA-256 digest of the file contents.
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If the file does not exist.
|
||||
OSError: If the file cannot be read.
|
||||
"""
|
||||
h = hashlib.sha256()
|
||||
with path.open("rb") as f:
|
||||
while chunk := f.read(chunk_size):
|
||||
h.update(chunk)
|
||||
return h.hexdigest()
|
||||
|
||||
@@ -7,7 +7,6 @@ import tempfile
|
||||
import zipfile
|
||||
from collections import defaultdict
|
||||
from collections import deque
|
||||
from contextlib import nullcontext
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from time import mktime
|
||||
@@ -159,7 +158,6 @@ from documents.models import UiSettings
|
||||
from documents.models import Workflow
|
||||
from documents.models import WorkflowAction
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.permissions import AcknowledgeTasksPermissions
|
||||
from documents.permissions import PaperlessAdminPermissions
|
||||
from documents.permissions import PaperlessNotePermissions
|
||||
@@ -227,7 +225,7 @@ from paperless.celery import app as celery_app
|
||||
from paperless.config import AIConfig
|
||||
from paperless.config import GeneralConfig
|
||||
from paperless.models import ApplicationConfiguration
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless.serialisers import GroupSerializer
|
||||
from paperless.serialisers import UserSerializer
|
||||
from paperless.views import StandardPagination
|
||||
@@ -838,61 +836,6 @@ class DocumentViewSet(
|
||||
"custom_field_",
|
||||
)
|
||||
|
||||
def _get_selection_data_for_queryset(self, queryset):
|
||||
correspondents = Correspondent.objects.annotate(
|
||||
document_count=Count(
|
||||
"documents",
|
||||
filter=Q(documents__in=queryset),
|
||||
distinct=True,
|
||||
),
|
||||
)
|
||||
tags = Tag.objects.annotate(
|
||||
document_count=Count(
|
||||
"documents",
|
||||
filter=Q(documents__in=queryset),
|
||||
distinct=True,
|
||||
),
|
||||
)
|
||||
document_types = DocumentType.objects.annotate(
|
||||
document_count=Count(
|
||||
"documents",
|
||||
filter=Q(documents__in=queryset),
|
||||
distinct=True,
|
||||
),
|
||||
)
|
||||
storage_paths = StoragePath.objects.annotate(
|
||||
document_count=Count(
|
||||
"documents",
|
||||
filter=Q(documents__in=queryset),
|
||||
distinct=True,
|
||||
),
|
||||
)
|
||||
custom_fields = CustomField.objects.annotate(
|
||||
document_count=Count(
|
||||
"fields__document",
|
||||
filter=Q(fields__document__in=queryset),
|
||||
distinct=True,
|
||||
),
|
||||
)
|
||||
|
||||
return {
|
||||
"selected_correspondents": [
|
||||
{"id": t.id, "document_count": t.document_count} for t in correspondents
|
||||
],
|
||||
"selected_tags": [
|
||||
{"id": t.id, "document_count": t.document_count} for t in tags
|
||||
],
|
||||
"selected_document_types": [
|
||||
{"id": t.id, "document_count": t.document_count} for t in document_types
|
||||
],
|
||||
"selected_storage_paths": [
|
||||
{"id": t.id, "document_count": t.document_count} for t in storage_paths
|
||||
],
|
||||
"selected_custom_fields": [
|
||||
{"id": t.id, "document_count": t.document_count} for t in custom_fields
|
||||
],
|
||||
}
|
||||
|
||||
def get_queryset(self):
|
||||
latest_version_content = Subquery(
|
||||
Document.objects.filter(root_document=OuterRef("pk"))
|
||||
@@ -1040,25 +983,6 @@ class DocumentViewSet(
|
||||
|
||||
return response
|
||||
|
||||
def list(self, request, *args, **kwargs):
|
||||
if not get_boolean(
|
||||
str(request.query_params.get("include_selection_data", "false")),
|
||||
):
|
||||
return super().list(request, *args, **kwargs)
|
||||
|
||||
queryset = self.filter_queryset(self.get_queryset())
|
||||
selection_data = self._get_selection_data_for_queryset(queryset)
|
||||
|
||||
page = self.paginate_queryset(queryset)
|
||||
if page is not None:
|
||||
serializer = self.get_serializer(page, many=True)
|
||||
response = self.get_paginated_response(serializer.data)
|
||||
response.data["selection_data"] = selection_data
|
||||
return response
|
||||
|
||||
serializer = self.get_serializer(queryset, many=True)
|
||||
return Response({"results": serializer.data, "selection_data": selection_data})
|
||||
|
||||
def destroy(self, request, *args, **kwargs):
|
||||
from documents import index
|
||||
|
||||
@@ -1158,17 +1082,17 @@ class DocumentViewSet(
|
||||
if not Path(file).is_file():
|
||||
return None
|
||||
|
||||
parser_class = get_parser_class_for_mime_type(mime_type)
|
||||
parser_class = get_parser_registry().get_parser_for_file(
|
||||
mime_type,
|
||||
Path(file).name,
|
||||
Path(file),
|
||||
)
|
||||
if parser_class:
|
||||
parser = parser_class(progress_callback=None, logging_group=None)
|
||||
cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)
|
||||
|
||||
try:
|
||||
with cm:
|
||||
with parser_class() as parser:
|
||||
return parser.extract_metadata(file, mime_type)
|
||||
except Exception: # pragma: no cover
|
||||
logger.exception(f"Issue getting metadata for {file}")
|
||||
# TODO: cover GPG errors, remove later.
|
||||
return []
|
||||
else: # pragma: no cover
|
||||
logger.warning(f"No parser for {mime_type}")
|
||||
@@ -2099,26 +2023,14 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
else None
|
||||
)
|
||||
|
||||
if get_boolean(
|
||||
str(
|
||||
request.query_params.get(
|
||||
"include_selection_data",
|
||||
"false",
|
||||
),
|
||||
),
|
||||
):
|
||||
result_ids = response.data.get("all", [])
|
||||
response.data["selection_data"] = (
|
||||
self._get_selection_data_for_queryset(
|
||||
Document.objects.filter(pk__in=result_ids),
|
||||
)
|
||||
)
|
||||
|
||||
return response
|
||||
except NotFound:
|
||||
raise
|
||||
except PermissionDenied as e:
|
||||
return HttpResponseForbidden(str(e.detail))
|
||||
invalid_more_like_id_message = _("Invalid more_like_id")
|
||||
if str(e.detail) == str(invalid_more_like_id_message):
|
||||
return HttpResponseForbidden(invalid_more_like_id_message)
|
||||
return HttpResponseForbidden(_("Insufficient permissions."))
|
||||
except Exception as e:
|
||||
logger.warning(f"An error occurred listing search results: {e!s}")
|
||||
return HttpResponseBadRequest(
|
||||
|
||||
@@ -282,7 +282,7 @@ def execute_password_removal_action(
|
||||
passwords = action.passwords
|
||||
if not passwords:
|
||||
logger.warning(
|
||||
"Password removal action %s has no passwords configured",
|
||||
"Workflow action %s has no configured unlock values",
|
||||
action.pk,
|
||||
extra={"group": logging_group},
|
||||
)
|
||||
@@ -321,22 +321,23 @@ def execute_password_removal_action(
|
||||
user=document.owner,
|
||||
)
|
||||
logger.info(
|
||||
"Removed password from document %s using workflow action %s",
|
||||
"Unlocked document %s using workflow action %s",
|
||||
document.pk,
|
||||
action.pk,
|
||||
extra={"group": logging_group},
|
||||
)
|
||||
return
|
||||
except ValueError as e:
|
||||
except ValueError:
|
||||
logger.warning(
|
||||
"Password removal failed for document %s with supplied password: %s",
|
||||
"Workflow action %s could not unlock document %s with one configured value",
|
||||
action.pk,
|
||||
document.pk,
|
||||
e,
|
||||
extra={"group": logging_group},
|
||||
)
|
||||
|
||||
logger.error(
|
||||
"Password removal failed for document %s after trying all provided passwords",
|
||||
"Workflow action %s could not unlock document %s with any configured value",
|
||||
action.pk,
|
||||
document.pk,
|
||||
extra={"group": logging_group},
|
||||
)
|
||||
|
||||
@@ -2,7 +2,7 @@ msgid ""
|
||||
msgstr ""
|
||||
"Project-Id-Version: paperless-ngx\n"
|
||||
"Report-Msgid-Bugs-To: \n"
|
||||
"POT-Creation-Date: 2026-03-21 09:25+0000\n"
|
||||
"POT-Creation-Date: 2026-03-26 14:37+0000\n"
|
||||
"PO-Revision-Date: 2022-02-17 04:17\n"
|
||||
"Last-Translator: \n"
|
||||
"Language-Team: English\n"
|
||||
@@ -1300,8 +1300,8 @@ msgid "workflow runs"
|
||||
msgstr ""
|
||||
|
||||
#: documents/serialisers.py:463 documents/serialisers.py:815
|
||||
#: documents/serialisers.py:2501 documents/views.py:1992
|
||||
#: paperless_mail/serialisers.py:143
|
||||
#: documents/serialisers.py:2501 documents/views.py:1990
|
||||
#: documents/views.py:2033 paperless_mail/serialisers.py:143
|
||||
msgid "Insufficient permissions."
|
||||
msgstr ""
|
||||
|
||||
@@ -1341,7 +1341,7 @@ msgstr ""
|
||||
msgid "Duplicate document identifiers are not allowed."
|
||||
msgstr ""
|
||||
|
||||
#: documents/serialisers.py:2587 documents/views.py:3598
|
||||
#: documents/serialisers.py:2587 documents/views.py:3599
|
||||
#, python-format
|
||||
msgid "Documents not found: %(ids)s"
|
||||
msgstr ""
|
||||
@@ -1383,13 +1383,18 @@ msgid "Please sign in."
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/account/login.html:12
|
||||
#, python-format
|
||||
msgid "Don't have an account yet? <a href=\"%(signup_url)s\">Sign up</a>"
|
||||
msgid "Don't have an account yet?"
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/account/login.html:12
|
||||
#: documents/templates/account/signup.html:43
|
||||
#: documents/templates/socialaccount/signup.html:29
|
||||
msgid "Sign up"
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/account/login.html:25
|
||||
#: documents/templates/account/signup.html:22
|
||||
#: documents/templates/socialaccount/signup.html:13
|
||||
#: documents/templates/socialaccount/signup.html:15
|
||||
msgid "Username"
|
||||
msgstr ""
|
||||
|
||||
@@ -1399,6 +1404,7 @@ msgid "Password"
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/account/login.html:36
|
||||
#: documents/templates/account/signup.html:11
|
||||
#: documents/templates/mfa/authenticate.html:23
|
||||
msgid "Sign in"
|
||||
msgstr ""
|
||||
@@ -1477,10 +1483,11 @@ msgid "Password reset complete."
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/account/password_reset_from_key_done.html:14
|
||||
#, python-format
|
||||
msgid ""
|
||||
"Your new password has been set. You can now <a href=\"%(login_url)s\">log "
|
||||
"in</a>"
|
||||
msgid "Your new password has been set. You can now"
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/account/password_reset_from_key_done.html:14
|
||||
msgid "log in"
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/account/signup.html:5
|
||||
@@ -1488,8 +1495,7 @@ msgid "Paperless-ngx sign up"
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/account/signup.html:11
|
||||
#, python-format
|
||||
msgid "Already have an account? <a href=\"%(login_url)s\">Sign in</a>"
|
||||
msgid "Already have an account?"
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/account/signup.html:19
|
||||
@@ -1499,7 +1505,7 @@ msgid ""
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/account/signup.html:23
|
||||
#: documents/templates/socialaccount/signup.html:14
|
||||
#: documents/templates/socialaccount/signup.html:16
|
||||
msgid "Email (optional)"
|
||||
msgstr ""
|
||||
|
||||
@@ -1507,11 +1513,6 @@ msgstr ""
|
||||
msgid "Password (again)"
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/account/signup.html:43
|
||||
#: documents/templates/socialaccount/signup.html:27
|
||||
msgid "Sign up"
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/index.html:61
|
||||
msgid "Paperless-ngx is loading..."
|
||||
msgstr ""
|
||||
@@ -1556,18 +1557,21 @@ msgid "Paperless-ngx social account sign in"
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/socialaccount/authentication_error.html:10
|
||||
#, python-format
|
||||
msgid ""
|
||||
"An error occurred while attempting to login via your social network account. "
|
||||
"Back to the <a href=\"%(login_url)s\">login page</a>"
|
||||
"Back to the"
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/socialaccount/login.html:10
|
||||
#: documents/templates/socialaccount/authentication_error.html:10
|
||||
msgid "login page"
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/socialaccount/login.html:11
|
||||
#, python-format
|
||||
msgid "You are about to connect a new third-party account from %(provider)s."
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/socialaccount/login.html:13
|
||||
#: documents/templates/socialaccount/login.html:15
|
||||
msgid "Continue"
|
||||
msgstr ""
|
||||
|
||||
@@ -1575,12 +1579,12 @@ msgstr ""
|
||||
msgid "Paperless-ngx social account sign up"
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/socialaccount/signup.html:10
|
||||
#: documents/templates/socialaccount/signup.html:11
|
||||
#, python-format
|
||||
msgid "You are about to use your %(provider_name)s account to login."
|
||||
msgstr ""
|
||||
|
||||
#: documents/templates/socialaccount/signup.html:11
|
||||
#: documents/templates/socialaccount/signup.html:13
|
||||
msgid "As a final step, please complete the following form:"
|
||||
msgstr ""
|
||||
|
||||
@@ -1605,24 +1609,24 @@ msgstr ""
|
||||
msgid "Unable to parse URI {value}"
|
||||
msgstr ""
|
||||
|
||||
#: documents/views.py:1985
|
||||
#: documents/views.py:1983 documents/views.py:2030
|
||||
msgid "Invalid more_like_id"
|
||||
msgstr ""
|
||||
|
||||
#: documents/views.py:3610
|
||||
#: documents/views.py:3611
|
||||
#, python-format
|
||||
msgid "Insufficient permissions to share document %(id)s."
|
||||
msgstr ""
|
||||
|
||||
#: documents/views.py:3653
|
||||
#: documents/views.py:3654
|
||||
msgid "Bundle is already being processed."
|
||||
msgstr ""
|
||||
|
||||
#: documents/views.py:3710
|
||||
#: documents/views.py:3711
|
||||
msgid "The share link bundle is still being prepared. Please try again later."
|
||||
msgstr ""
|
||||
|
||||
#: documents/views.py:3720
|
||||
#: documents/views.py:3721
|
||||
msgid "The share link bundle is unavailable."
|
||||
msgstr ""
|
||||
|
||||
@@ -1862,151 +1866,151 @@ msgstr ""
|
||||
msgid "paperless application settings"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:521
|
||||
#: paperless/settings/__init__.py:518
|
||||
msgid "English (US)"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:522
|
||||
#: paperless/settings/__init__.py:519
|
||||
msgid "Arabic"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:523
|
||||
#: paperless/settings/__init__.py:520
|
||||
msgid "Afrikaans"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:524
|
||||
#: paperless/settings/__init__.py:521
|
||||
msgid "Belarusian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:525
|
||||
#: paperless/settings/__init__.py:522
|
||||
msgid "Bulgarian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:526
|
||||
#: paperless/settings/__init__.py:523
|
||||
msgid "Catalan"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:527
|
||||
#: paperless/settings/__init__.py:524
|
||||
msgid "Czech"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:528
|
||||
#: paperless/settings/__init__.py:525
|
||||
msgid "Danish"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:529
|
||||
#: paperless/settings/__init__.py:526
|
||||
msgid "German"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:530
|
||||
#: paperless/settings/__init__.py:527
|
||||
msgid "Greek"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:531
|
||||
#: paperless/settings/__init__.py:528
|
||||
msgid "English (GB)"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:532
|
||||
#: paperless/settings/__init__.py:529
|
||||
msgid "Spanish"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:533
|
||||
#: paperless/settings/__init__.py:530
|
||||
msgid "Persian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:534
|
||||
#: paperless/settings/__init__.py:531
|
||||
msgid "Finnish"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:535
|
||||
#: paperless/settings/__init__.py:532
|
||||
msgid "French"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:536
|
||||
#: paperless/settings/__init__.py:533
|
||||
msgid "Hungarian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:537
|
||||
#: paperless/settings/__init__.py:534
|
||||
msgid "Indonesian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:538
|
||||
#: paperless/settings/__init__.py:535
|
||||
msgid "Italian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:539
|
||||
#: paperless/settings/__init__.py:536
|
||||
msgid "Japanese"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:540
|
||||
#: paperless/settings/__init__.py:537
|
||||
msgid "Korean"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:541
|
||||
#: paperless/settings/__init__.py:538
|
||||
msgid "Luxembourgish"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:542
|
||||
#: paperless/settings/__init__.py:539
|
||||
msgid "Norwegian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:543
|
||||
#: paperless/settings/__init__.py:540
|
||||
msgid "Dutch"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:544
|
||||
#: paperless/settings/__init__.py:541
|
||||
msgid "Polish"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:545
|
||||
#: paperless/settings/__init__.py:542
|
||||
msgid "Portuguese (Brazil)"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:546
|
||||
#: paperless/settings/__init__.py:543
|
||||
msgid "Portuguese"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:547
|
||||
#: paperless/settings/__init__.py:544
|
||||
msgid "Romanian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:548
|
||||
#: paperless/settings/__init__.py:545
|
||||
msgid "Russian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:549
|
||||
#: paperless/settings/__init__.py:546
|
||||
msgid "Slovak"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:550
|
||||
#: paperless/settings/__init__.py:547
|
||||
msgid "Slovenian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:551
|
||||
#: paperless/settings/__init__.py:548
|
||||
msgid "Serbian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:552
|
||||
#: paperless/settings/__init__.py:549
|
||||
msgid "Swedish"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:553
|
||||
#: paperless/settings/__init__.py:550
|
||||
msgid "Turkish"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:554
|
||||
#: paperless/settings/__init__.py:551
|
||||
msgid "Ukrainian"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:555
|
||||
#: paperless/settings/__init__.py:552
|
||||
msgid "Vietnamese"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:556
|
||||
#: paperless/settings/__init__.py:553
|
||||
msgid "Chinese Simplified"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/settings/__init__.py:557
|
||||
#: paperless/settings/__init__.py:554
|
||||
msgid "Chinese Traditional"
|
||||
msgstr ""
|
||||
|
||||
@@ -2052,7 +2056,7 @@ msgid ""
|
||||
"process all matching rules that you have defined."
|
||||
msgstr ""
|
||||
|
||||
#: paperless_mail/apps.py:11
|
||||
#: paperless_mail/apps.py:8
|
||||
msgid "Paperless mail"
|
||||
msgstr ""
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ import os
|
||||
import pwd
|
||||
import shutil
|
||||
import stat
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
@@ -131,23 +132,14 @@ def settings_values_check(app_configs, **kwargs):
|
||||
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
|
||||
)
|
||||
|
||||
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
|
||||
if settings.OCR_MODE not in {"auto", "force", "redo", "off"}:
|
||||
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
|
||||
|
||||
if settings.OCR_MODE == "skip_noarchive":
|
||||
msgs.append(
|
||||
Warning(
|
||||
'OCR output mode "skip_noarchive" is deprecated and will be '
|
||||
"removed in a future version. Please use "
|
||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
|
||||
),
|
||||
)
|
||||
|
||||
if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
|
||||
if settings.ARCHIVE_FILE_GENERATION not in {"auto", "always", "never"}:
|
||||
msgs.append(
|
||||
Error(
|
||||
"OCR_SKIP_ARCHIVE_FILE setting "
|
||||
f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid',
|
||||
"PAPERLESS_ARCHIVE_FILE_GENERATION setting "
|
||||
f'"{settings.ARCHIVE_FILE_GENERATION}" is not valid',
|
||||
),
|
||||
)
|
||||
|
||||
@@ -299,3 +291,97 @@ def check_deprecated_db_settings(
|
||||
)
|
||||
|
||||
return warnings
|
||||
|
||||
|
||||
@register()
|
||||
def check_deprecated_v2_ocr_env_vars(
|
||||
app_configs: object,
|
||||
**kwargs: object,
|
||||
) -> list[Warning]:
|
||||
"""Warn when deprecated v2 OCR environment variables are set.
|
||||
|
||||
Users upgrading from v2 may still have these in their environment or
|
||||
config files, where they are now silently ignored.
|
||||
"""
|
||||
warnings: list[Warning] = []
|
||||
|
||||
if os.environ.get("PAPERLESS_OCR_SKIP_ARCHIVE_FILE"):
|
||||
warnings.append(
|
||||
Warning(
|
||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE is set but has no effect. "
|
||||
"Use PAPERLESS_ARCHIVE_FILE_GENERATION=never/always/auto instead.",
|
||||
id="paperless.W002",
|
||||
),
|
||||
)
|
||||
|
||||
ocr_mode = os.environ.get("PAPERLESS_OCR_MODE", "")
|
||||
if ocr_mode in {"skip", "skip_noarchive"}:
|
||||
warnings.append(
|
||||
Warning(
|
||||
f"PAPERLESS_OCR_MODE={ocr_mode!r} is not a valid value. "
|
||||
f"Use PAPERLESS_OCR_MODE=auto (and PAPERLESS_ARCHIVE_FILE_GENERATION=never "
|
||||
f"if you used skip_noarchive) instead.",
|
||||
id="paperless.W003",
|
||||
),
|
||||
)
|
||||
|
||||
return warnings
|
||||
|
||||
|
||||
@register()
|
||||
def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
|
||||
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
|
||||
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
|
||||
):
|
||||
return [
|
||||
Error(
|
||||
"Azure AI remote parser requires endpoint and API key to be configured.",
|
||||
),
|
||||
]
|
||||
|
||||
return []
|
||||
|
||||
|
||||
def get_tesseract_langs():
|
||||
proc = subprocess.run(
|
||||
[shutil.which("tesseract"), "--list-langs"],
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
# Decode bytes to string, split on newlines, trim out the header
|
||||
proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:]
|
||||
|
||||
return [x.strip() for x in proc_lines]
|
||||
|
||||
|
||||
@register()
|
||||
def check_default_language_available(app_configs, **kwargs):
|
||||
errs = []
|
||||
|
||||
if not settings.OCR_LANGUAGE:
|
||||
errs.append(
|
||||
Warning(
|
||||
"No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
|
||||
"This means that tesseract will fallback to english.",
|
||||
),
|
||||
)
|
||||
return errs
|
||||
|
||||
# binaries_check in paperless will check and report if this doesn't exist
|
||||
# So skip trying to do anything here and let that handle missing binaries
|
||||
if shutil.which("tesseract") is not None:
|
||||
installed_langs = get_tesseract_langs()
|
||||
|
||||
specified_langs = [x.strip() for x in settings.OCR_LANGUAGE.split("+")]
|
||||
|
||||
for lang in specified_langs:
|
||||
if lang not in installed_langs:
|
||||
errs.append(
|
||||
Error(
|
||||
f"The selected ocr language {lang} is "
|
||||
f"not installed. Paperless cannot OCR your documents "
|
||||
f"without it. Please fix PAPERLESS_OCR_LANGUAGE.",
|
||||
),
|
||||
)
|
||||
|
||||
return errs
|
||||
|
||||
@@ -46,7 +46,7 @@ class OcrConfig(OutputTypeConfig):
|
||||
pages: int | None = dataclasses.field(init=False)
|
||||
language: str = dataclasses.field(init=False)
|
||||
mode: str = dataclasses.field(init=False)
|
||||
skip_archive_file: str = dataclasses.field(init=False)
|
||||
archive_file_generation: str = dataclasses.field(init=False)
|
||||
image_dpi: int | None = dataclasses.field(init=False)
|
||||
clean: str = dataclasses.field(init=False)
|
||||
deskew: bool = dataclasses.field(init=False)
|
||||
@@ -64,8 +64,8 @@ class OcrConfig(OutputTypeConfig):
|
||||
self.pages = app_config.pages or settings.OCR_PAGES
|
||||
self.language = app_config.language or settings.OCR_LANGUAGE
|
||||
self.mode = app_config.mode or settings.OCR_MODE
|
||||
self.skip_archive_file = (
|
||||
app_config.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
|
||||
self.archive_file_generation = (
|
||||
app_config.archive_file_generation or settings.ARCHIVE_FILE_GENERATION
|
||||
)
|
||||
self.image_dpi = app_config.image_dpi or settings.OCR_IMAGE_DPI
|
||||
self.clean = app_config.unpaper_clean or settings.OCR_CLEAN
|
||||
|
||||
44
src/paperless/migrations/0008_replace_skip_archive_file.py
Normal file
44
src/paperless/migrations/0008_replace_skip_archive_file.py
Normal file
@@ -0,0 +1,44 @@
|
||||
# Generated by Django 5.2.12 on 2026-03-26 20:31
|
||||
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("paperless", "0007_optimize_integer_field_sizes"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RemoveField(
|
||||
model_name="applicationconfiguration",
|
||||
name="skip_archive_file",
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name="applicationconfiguration",
|
||||
name="archive_file_generation",
|
||||
field=models.CharField(
|
||||
blank=True,
|
||||
choices=[("auto", "auto"), ("always", "always"), ("never", "never")],
|
||||
max_length=8,
|
||||
null=True,
|
||||
verbose_name="Controls archive file generation",
|
||||
),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name="applicationconfiguration",
|
||||
name="mode",
|
||||
field=models.CharField(
|
||||
blank=True,
|
||||
choices=[
|
||||
("auto", "auto"),
|
||||
("force", "force"),
|
||||
("redo", "redo"),
|
||||
("off", "off"),
|
||||
],
|
||||
max_length=16,
|
||||
null=True,
|
||||
verbose_name="Sets the OCR mode",
|
||||
),
|
||||
),
|
||||
]
|
||||
@@ -36,20 +36,20 @@ class ModeChoices(models.TextChoices):
|
||||
and our own custom setting
|
||||
"""
|
||||
|
||||
SKIP = ("skip", _("skip"))
|
||||
REDO = ("redo", _("redo"))
|
||||
AUTO = ("auto", _("auto"))
|
||||
FORCE = ("force", _("force"))
|
||||
SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive"))
|
||||
REDO = ("redo", _("redo"))
|
||||
OFF = ("off", _("off"))
|
||||
|
||||
|
||||
class ArchiveFileChoices(models.TextChoices):
|
||||
class ArchiveFileGenerationChoices(models.TextChoices):
|
||||
"""
|
||||
Settings to control creation of an archive PDF file
|
||||
"""
|
||||
|
||||
NEVER = ("never", _("never"))
|
||||
WITH_TEXT = ("with_text", _("with_text"))
|
||||
AUTO = ("auto", _("auto"))
|
||||
ALWAYS = ("always", _("always"))
|
||||
NEVER = ("never", _("never"))
|
||||
|
||||
|
||||
class CleanChoices(models.TextChoices):
|
||||
@@ -126,12 +126,12 @@ class ApplicationConfiguration(AbstractSingletonModel):
|
||||
choices=ModeChoices.choices,
|
||||
)
|
||||
|
||||
skip_archive_file = models.CharField(
|
||||
verbose_name=_("Controls the generation of an archive file"),
|
||||
archive_file_generation = models.CharField(
|
||||
verbose_name=_("Controls archive file generation"),
|
||||
null=True,
|
||||
blank=True,
|
||||
max_length=16,
|
||||
choices=ArchiveFileChoices.choices,
|
||||
max_length=8,
|
||||
choices=ArchiveFileGenerationChoices.choices,
|
||||
)
|
||||
|
||||
image_dpi = models.PositiveSmallIntegerField(
|
||||
|
||||
@@ -33,6 +33,7 @@ name, version, author, url, supported_mime_types (callable), score (callable).
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
from importlib.metadata import entry_points
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
@@ -49,6 +50,7 @@ logger = logging.getLogger("paperless.parsers.registry")
|
||||
|
||||
_registry: ParserRegistry | None = None
|
||||
_discovery_complete: bool = False
|
||||
_lock = threading.Lock()
|
||||
|
||||
# Attribute names that every registered external parser class must expose.
|
||||
_REQUIRED_ATTRS: tuple[str, ...] = (
|
||||
@@ -74,7 +76,6 @@ def get_parser_registry() -> ParserRegistry:
|
||||
1. Creates a new ParserRegistry.
|
||||
2. Calls register_defaults to install built-in parsers.
|
||||
3. Calls discover to load third-party plugins via importlib.metadata entrypoints.
|
||||
4. Calls log_summary to emit a startup summary.
|
||||
|
||||
Subsequent calls return the same instance immediately.
|
||||
|
||||
@@ -85,14 +86,15 @@ def get_parser_registry() -> ParserRegistry:
|
||||
"""
|
||||
global _registry, _discovery_complete
|
||||
|
||||
if _registry is None:
|
||||
_registry = ParserRegistry()
|
||||
_registry.register_defaults()
|
||||
with _lock:
|
||||
if _registry is None:
|
||||
r = ParserRegistry()
|
||||
r.register_defaults()
|
||||
_registry = r
|
||||
|
||||
if not _discovery_complete:
|
||||
_registry.discover()
|
||||
_registry.log_summary()
|
||||
_discovery_complete = True
|
||||
if not _discovery_complete:
|
||||
_registry.discover()
|
||||
_discovery_complete = True
|
||||
|
||||
return _registry
|
||||
|
||||
@@ -113,9 +115,11 @@ def init_builtin_parsers() -> None:
|
||||
"""
|
||||
global _registry
|
||||
|
||||
if _registry is None:
|
||||
_registry = ParserRegistry()
|
||||
_registry.register_defaults()
|
||||
with _lock:
|
||||
if _registry is None:
|
||||
r = ParserRegistry()
|
||||
r.register_defaults()
|
||||
_registry = r
|
||||
|
||||
|
||||
def reset_parser_registry() -> None:
|
||||
@@ -304,6 +308,23 @@ class ParserRegistry:
|
||||
getattr(cls, "url", "unknown"),
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Inspection helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def all_parsers(self) -> list[type[ParserProtocol]]:
|
||||
"""Return all registered parser classes (external first, then builtins).
|
||||
|
||||
Used by compatibility wrappers that need to iterate every parser to
|
||||
compute the full set of supported MIME types and file extensions.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list[type[ParserProtocol]]
|
||||
External parsers followed by built-in parsers.
|
||||
"""
|
||||
return [*self._external, *self._builtins]
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Parser resolution
|
||||
# ------------------------------------------------------------------
|
||||
@@ -334,7 +355,7 @@ class ParserRegistry:
|
||||
mime_type:
|
||||
The detected MIME type of the file.
|
||||
filename:
|
||||
The original filename, including extension.
|
||||
The original filename, including extension. May be empty in some cases
|
||||
path:
|
||||
Optional filesystem path to the file. Forwarded to each
|
||||
parser's score method.
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.resources
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
@@ -18,9 +19,11 @@ from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.utils import maybe_override_pixel_limit
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ModeChoices
|
||||
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
|
||||
from paperless.parsers.utils import extract_pdf_text
|
||||
from paperless.parsers.utils import is_tagged_pdf
|
||||
from paperless.parsers.utils import read_file_handle_unicode_errors
|
||||
from paperless.version import __full_version_str__
|
||||
|
||||
@@ -250,36 +253,7 @@ class RasterisedDocumentParser:
|
||||
if not Path(pdf_file).is_file():
|
||||
return None
|
||||
|
||||
try:
|
||||
text = None
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w+",
|
||||
dir=self.tempdir,
|
||||
) as tmp:
|
||||
run_subprocess(
|
||||
[
|
||||
"pdftotext",
|
||||
"-q",
|
||||
"-layout",
|
||||
"-enc",
|
||||
"UTF-8",
|
||||
str(pdf_file),
|
||||
tmp.name,
|
||||
],
|
||||
logger=self.log,
|
||||
)
|
||||
text = read_file_handle_unicode_errors(Path(tmp.name))
|
||||
|
||||
return post_process_text(text)
|
||||
|
||||
except Exception:
|
||||
# If pdftotext fails, fall back to OCR.
|
||||
self.log.warning(
|
||||
"Error while getting text from PDF document with pdftotext",
|
||||
exc_info=True,
|
||||
)
|
||||
# probably not a PDF file.
|
||||
return None
|
||||
return post_process_text(extract_pdf_text(Path(pdf_file), log=self.log))
|
||||
|
||||
def construct_ocrmypdf_parameters(
|
||||
self,
|
||||
@@ -289,6 +263,7 @@ class RasterisedDocumentParser:
|
||||
sidecar_file: Path,
|
||||
*,
|
||||
safe_fallback: bool = False,
|
||||
skip_text: bool = False,
|
||||
) -> dict[str, Any]:
|
||||
ocrmypdf_args: dict[str, Any] = {
|
||||
"input_file_or_options": input_file,
|
||||
@@ -307,15 +282,14 @@ class RasterisedDocumentParser:
|
||||
self.settings.color_conversion_strategy
|
||||
)
|
||||
|
||||
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
|
||||
if safe_fallback or self.settings.mode == ModeChoices.FORCE:
|
||||
ocrmypdf_args["force_ocr"] = True
|
||||
elif self.settings.mode in {
|
||||
ModeChoices.SKIP,
|
||||
ModeChoices.SKIP_NO_ARCHIVE,
|
||||
}:
|
||||
ocrmypdf_args["skip_text"] = True
|
||||
elif self.settings.mode == ModeChoices.REDO:
|
||||
ocrmypdf_args["redo_ocr"] = True
|
||||
elif skip_text or self.settings.mode == ModeChoices.OFF:
|
||||
ocrmypdf_args["skip_text"] = True
|
||||
elif self.settings.mode == ModeChoices.AUTO:
|
||||
pass # no extra flag: normal OCR (text not found case)
|
||||
else: # pragma: no cover
|
||||
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
|
||||
|
||||
@@ -400,6 +374,62 @@ class RasterisedDocumentParser:
|
||||
|
||||
return ocrmypdf_args
|
||||
|
||||
def _convert_image_to_pdfa(self, document_path: Path, mime_type: str) -> Path:
|
||||
"""Convert an image to a PDF/A-2b file without invoking the OCR engine.
|
||||
|
||||
Uses img2pdf for the initial image->PDF wrapping, then pikepdf to stamp
|
||||
PDF/A-2b conformance metadata.
|
||||
|
||||
No Tesseract and no Ghostscript are invoked.
|
||||
"""
|
||||
import img2pdf
|
||||
import pikepdf
|
||||
|
||||
plain_pdf_path = Path(self.tempdir) / "image_plain.pdf"
|
||||
try:
|
||||
layout_fun = None
|
||||
if self.settings.image_dpi is not None:
|
||||
layout_fun = img2pdf.get_fixed_dpi_layout_fun(
|
||||
(self.settings.image_dpi, self.settings.image_dpi),
|
||||
)
|
||||
plain_pdf_path.write_bytes(
|
||||
img2pdf.convert(str(document_path), layout_fun=layout_fun),
|
||||
)
|
||||
except Exception as e:
|
||||
raise ParseError(
|
||||
f"img2pdf conversion failed for {document_path}: {e!s}",
|
||||
) from e
|
||||
|
||||
icc_data = (
|
||||
importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
|
||||
)
|
||||
|
||||
pdfa_path = Path(self.tempdir) / "archive.pdf"
|
||||
try:
|
||||
with pikepdf.open(plain_pdf_path) as pdf:
|
||||
cs = pdf.make_stream(icc_data)
|
||||
cs["/N"] = 3
|
||||
output_intent = pikepdf.Dictionary(
|
||||
Type=pikepdf.Name("/OutputIntent"),
|
||||
S=pikepdf.Name("/GTS_PDFA1"),
|
||||
OutputConditionIdentifier=pikepdf.String("sRGB"),
|
||||
DestOutputProfile=cs,
|
||||
)
|
||||
pdf.Root["/OutputIntents"] = pdf.make_indirect(
|
||||
pikepdf.Array([output_intent]),
|
||||
)
|
||||
meta = pdf.open_metadata(set_pikepdf_as_editor=False)
|
||||
meta["pdfaid:part"] = "2"
|
||||
meta["pdfaid:conformance"] = "B"
|
||||
pdf.save(pdfa_path)
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"PDF/A metadata stamping failed ({e!s}); falling back to plain PDF.",
|
||||
)
|
||||
pdfa_path.write_bytes(plain_pdf_path.read_bytes())
|
||||
|
||||
return pdfa_path
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
@@ -409,57 +439,106 @@ class RasterisedDocumentParser:
|
||||
) -> None:
|
||||
# This forces tesseract to use one core per page.
|
||||
os.environ["OMP_THREAD_LIMIT"] = "1"
|
||||
VALID_TEXT_LENGTH = 50
|
||||
|
||||
if mime_type == "application/pdf":
|
||||
text_original = self.extract_text(None, document_path)
|
||||
original_has_text = (
|
||||
text_original is not None and len(text_original) > VALID_TEXT_LENGTH
|
||||
original_has_text = is_tagged_pdf(document_path, log=self.log) or (
|
||||
text_original is not None and len(text_original) > PDF_TEXT_MIN_LENGTH
|
||||
)
|
||||
else:
|
||||
text_original = None
|
||||
original_has_text = False
|
||||
|
||||
# If the original has text, and the user doesn't want an archive,
|
||||
# we're done here
|
||||
skip_archive_for_text = (
|
||||
self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
|
||||
or self.settings.skip_archive_file
|
||||
in {
|
||||
ArchiveFileChoices.WITH_TEXT,
|
||||
ArchiveFileChoices.ALWAYS,
|
||||
}
|
||||
)
|
||||
if skip_archive_for_text and original_has_text:
|
||||
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
|
||||
# --- OCR_MODE=off: never invoke OCR engine ---
|
||||
if self.settings.mode == ModeChoices.OFF:
|
||||
if not produce_archive:
|
||||
self.text = text_original or ""
|
||||
return
|
||||
if self.is_image(mime_type):
|
||||
try:
|
||||
self.archive_path = self._convert_image_to_pdfa(
|
||||
document_path,
|
||||
mime_type,
|
||||
)
|
||||
self.text = ""
|
||||
except Exception as e:
|
||||
raise ParseError(
|
||||
f"Image to PDF/A conversion failed: {e!s}",
|
||||
) from e
|
||||
return
|
||||
# PDFs in off mode: PDF/A conversion only via skip_text
|
||||
import ocrmypdf
|
||||
from ocrmypdf import SubprocessOutputError
|
||||
|
||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
||||
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
archive_path,
|
||||
sidecar_file,
|
||||
skip_text=True,
|
||||
)
|
||||
try:
|
||||
self.log.debug(
|
||||
f"Calling OCRmyPDF (off mode, PDF/A conversion only): {args}",
|
||||
)
|
||||
ocrmypdf.ocr(**args)
|
||||
self.archive_path = archive_path
|
||||
self.text = self.extract_text(None, archive_path) or text_original or ""
|
||||
except SubprocessOutputError as e:
|
||||
if "Ghostscript PDF/A rendering" in str(e):
|
||||
self.log.warning(
|
||||
"Ghostscript PDF/A rendering failed, consider setting "
|
||||
"PAPERLESS_OCR_USER_ARGS: "
|
||||
"'{\"continue_on_soft_render_error\": true}'",
|
||||
)
|
||||
raise ParseError(
|
||||
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
||||
) from e
|
||||
except Exception as e:
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
return
|
||||
|
||||
# --- OCR_MODE=auto: skip ocrmypdf entirely if text exists and no archive needed ---
|
||||
if (
|
||||
self.settings.mode == ModeChoices.AUTO
|
||||
and original_has_text
|
||||
and not produce_archive
|
||||
):
|
||||
self.log.debug(
|
||||
"Document has text and no archive requested; skipping OCRmyPDF entirely.",
|
||||
)
|
||||
self.text = text_original
|
||||
return
|
||||
|
||||
# Either no text was in the original or there should be an archive
|
||||
# file created, so OCR the file and create an archive with any
|
||||
# text located via OCR
|
||||
|
||||
# --- All other paths: run ocrmypdf ---
|
||||
import ocrmypdf
|
||||
from ocrmypdf import EncryptedPdfError
|
||||
from ocrmypdf import InputFileError
|
||||
from ocrmypdf import SubprocessOutputError
|
||||
from ocrmypdf.exceptions import DigitalSignatureError
|
||||
from ocrmypdf.exceptions import PriorOcrFoundError
|
||||
|
||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
||||
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
||||
|
||||
# auto mode with existing text: PDF/A conversion only (no OCR).
|
||||
skip_text = self.settings.mode == ModeChoices.AUTO and original_has_text
|
||||
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
archive_path,
|
||||
sidecar_file,
|
||||
skip_text=skip_text,
|
||||
)
|
||||
|
||||
try:
|
||||
self.log.debug(f"Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
|
||||
if produce_archive:
|
||||
self.archive_path = archive_path
|
||||
|
||||
self.text = self.extract_text(sidecar_file, archive_path)
|
||||
@@ -479,11 +558,10 @@ class RasterisedDocumentParser:
|
||||
"Ghostscript PDF/A rendering failed, consider setting "
|
||||
"PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
|
||||
)
|
||||
|
||||
raise ParseError(
|
||||
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
||||
) from e
|
||||
except (NoTextFoundException, InputFileError) as e:
|
||||
except (NoTextFoundException, InputFileError, PriorOcrFoundError) as e:
|
||||
self.log.warning(
|
||||
f"Encountered an error while running OCR: {e!s}. "
|
||||
f"Attempting force OCR to get the text.",
|
||||
@@ -492,8 +570,6 @@ class RasterisedDocumentParser:
|
||||
archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
|
||||
sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
|
||||
|
||||
# Attempt to run OCR with safe settings.
|
||||
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
@@ -505,25 +581,18 @@ class RasterisedDocumentParser:
|
||||
try:
|
||||
self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
# Don't return the archived file here, since this file
|
||||
# is bigger and blurry due to --force-ocr.
|
||||
|
||||
self.text = self.extract_text(
|
||||
sidecar_file_fallback,
|
||||
archive_path_fallback,
|
||||
)
|
||||
|
||||
if produce_archive:
|
||||
self.archive_path = archive_path_fallback
|
||||
except Exception as e:
|
||||
# If this fails, we have a serious issue at hand.
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
|
||||
except Exception as e:
|
||||
# Anything else is probably serious.
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
|
||||
# As a last resort, if we still don't have any text for any reason,
|
||||
# try to extract the text from the original document.
|
||||
if not self.text:
|
||||
if original_has_text:
|
||||
self.text = text_original
|
||||
|
||||
@@ -10,15 +10,104 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
|
||||
logger = logging.getLogger("paperless.parsers.utils")
|
||||
|
||||
# Minimum character count for a PDF to be considered "born-digital" (has real text).
|
||||
# Used by both the consumer (archive decision) and the tesseract parser (skip-OCR decision).
|
||||
PDF_TEXT_MIN_LENGTH = 50
|
||||
|
||||
|
||||
def is_tagged_pdf(
|
||||
path: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> bool:
|
||||
"""Return True if the PDF declares itself as tagged (born-digital indicator).
|
||||
|
||||
Tagged PDFs (e.g. exported from Word or LibreOffice) have ``/MarkInfo``
|
||||
with ``/Marked true`` in the document root. This is a reliable signal
|
||||
that the document has a logical structure and embedded text — running OCR
|
||||
on it is unnecessary and archive generation can be skipped.
|
||||
|
||||
https://github.com/ocrmypdf/OCRmyPDF/blob/4e974ebd465a5921b2e79004f098f5d203010282/src/ocrmypdf/pdfinfo/info.py#L449
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path:
|
||||
Absolute path to the PDF file.
|
||||
log:
|
||||
Logger for warnings. Falls back to the module-level logger when omitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
``True`` when the PDF is tagged, ``False`` otherwise or on any error.
|
||||
"""
|
||||
import pikepdf
|
||||
|
||||
_log = log or logger
|
||||
try:
|
||||
with pikepdf.open(path) as pdf:
|
||||
mark_info = pdf.Root.get("/MarkInfo")
|
||||
if mark_info is None:
|
||||
return False
|
||||
return bool(mark_info.get("/Marked", False))
|
||||
except Exception:
|
||||
_log.warning("Could not check PDF tag status for %s", path, exc_info=True)
|
||||
return False
|
||||
|
||||
|
||||
def extract_pdf_text(
|
||||
path: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> str | None:
|
||||
"""Run pdftotext on *path* and return the extracted text, or None on failure.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path:
|
||||
Absolute path to the PDF file.
|
||||
log:
|
||||
Logger for warnings. Falls back to the module-level logger when omitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or ``None`` if pdftotext fails or the file is not a PDF.
|
||||
"""
|
||||
from documents.utils import run_subprocess
|
||||
|
||||
_log = log or logger
|
||||
try:
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
out_path = Path(tmpdir) / "text.txt"
|
||||
run_subprocess(
|
||||
[
|
||||
"pdftotext",
|
||||
"-q",
|
||||
"-layout",
|
||||
"-enc",
|
||||
"UTF-8",
|
||||
str(path),
|
||||
str(out_path),
|
||||
],
|
||||
logger=_log,
|
||||
)
|
||||
text = read_file_handle_unicode_errors(out_path, log=_log)
|
||||
return text or None
|
||||
except Exception:
|
||||
_log.warning(
|
||||
"Error while getting text from PDF document with pdftotext",
|
||||
exc_info=True,
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
def read_file_handle_unicode_errors(
|
||||
filepath: Path,
|
||||
|
||||
@@ -21,6 +21,7 @@ from paperless.settings.custom import parse_hosting_settings
|
||||
from paperless.settings.custom import parse_ignore_dates
|
||||
from paperless.settings.custom import parse_redis_url
|
||||
from paperless.settings.parsers import get_bool_from_env
|
||||
from paperless.settings.parsers import get_choice_from_env
|
||||
from paperless.settings.parsers import get_float_from_env
|
||||
from paperless.settings.parsers import get_int_from_env
|
||||
from paperless.settings.parsers import get_list_from_env
|
||||
@@ -121,10 +122,7 @@ INSTALLED_APPS = [
|
||||
"django_extensions",
|
||||
"paperless",
|
||||
"documents.apps.DocumentsConfig",
|
||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||
"paperless_text.apps.PaperlessTextConfig",
|
||||
"paperless_mail.apps.PaperlessMailConfig",
|
||||
"paperless_remote.apps.PaperlessRemoteParserConfig",
|
||||
"django.contrib.admin",
|
||||
"rest_framework",
|
||||
"rest_framework.authtoken",
|
||||
@@ -877,10 +875,17 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
||||
# OCRmyPDF --output-type options are available.
|
||||
OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
|
||||
|
||||
# skip. redo, force
|
||||
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
|
||||
OCR_MODE = get_choice_from_env(
|
||||
"PAPERLESS_OCR_MODE",
|
||||
{"auto", "force", "redo", "off"},
|
||||
default="auto",
|
||||
)
|
||||
|
||||
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
|
||||
ARCHIVE_FILE_GENERATION = get_choice_from_env(
|
||||
"PAPERLESS_ARCHIVE_FILE_GENERATION",
|
||||
{"auto", "always", "never"},
|
||||
default="auto",
|
||||
)
|
||||
|
||||
OCR_IMAGE_DPI = get_int_from_env("PAPERLESS_OCR_IMAGE_DPI")
|
||||
|
||||
@@ -974,8 +979,8 @@ TIKA_GOTENBERG_ENDPOINT = os.getenv(
|
||||
"http://localhost:3000",
|
||||
)
|
||||
|
||||
if TIKA_ENABLED:
|
||||
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
|
||||
# Tika parser is now integrated into the main parser registry
|
||||
# No separate Django app needed
|
||||
|
||||
AUDIT_LOG_ENABLED = get_bool_from_env("PAPERLESS_AUDIT_LOG_ENABLED", "true")
|
||||
if AUDIT_LOG_ENABLED:
|
||||
|
||||
@@ -90,35 +90,6 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
|
||||
yield parser
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Remote parser sample files
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def remote_samples_dir(samples_dir: Path) -> Path:
|
||||
"""Absolute path to the remote parser sample files directory.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
``<samples_dir>/remote/``
|
||||
"""
|
||||
return samples_dir / "remote"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_pdf_file(remote_samples_dir: Path) -> Path:
|
||||
"""Path to a simple digital PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``remote/simple-digital.pdf``.
|
||||
"""
|
||||
return remote_samples_dir / "simple-digital.pdf"
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Remote parser instance
|
||||
# ------------------------------------------------------------------
|
||||
@@ -737,7 +708,7 @@ def null_app_config(mocker: MockerFixture) -> MagicMock:
|
||||
pages=None,
|
||||
language=None,
|
||||
mode=None,
|
||||
skip_archive_file=None,
|
||||
archive_file_generation=None,
|
||||
image_dpi=None,
|
||||
unpaper_clean=None,
|
||||
deskew=None,
|
||||
|
||||
436
src/paperless/tests/parsers/test_parse_modes.py
Normal file
436
src/paperless/tests/parsers/test_parse_modes.py
Normal file
@@ -0,0 +1,436 @@
|
||||
"""
|
||||
Focused tests for RasterisedDocumentParser.parse() mode behaviour.
|
||||
|
||||
These tests mock ``ocrmypdf.ocr`` so they run without a real Tesseract/OCRmyPDF
|
||||
installation and execute quickly. The intent is to verify the *control flow*
|
||||
introduced by the ``produce_archive`` flag and the ``OCR_MODE=auto/off`` logic,
|
||||
not to test OCRmyPDF itself.
|
||||
|
||||
Fixtures are pulled from conftest.py in this package.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_LONG_TEXT = "This is a test document with enough text. " * 5 # >50 chars
|
||||
_SHORT_TEXT = "Hi." # <50 chars
|
||||
|
||||
|
||||
def _make_extract_text(text: str | None):
|
||||
"""Return a side_effect function for ``extract_text`` that returns *text*."""
|
||||
|
||||
def _extract(sidecar_file, pdf_file):
|
||||
return text
|
||||
|
||||
return _extract
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# AUTO mode — PDF with sufficient text layer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestAutoModeWithText:
|
||||
"""AUTO mode, original PDF has detectable text (>50 chars)."""
|
||||
|
||||
def test_auto_text_no_archive_skips_ocrmypdf(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=False
|
||||
- PDF with text > VALID_TEXT_LENGTH
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr is NOT called (early return path)
|
||||
- archive_path remains None
|
||||
- text is set from the original
|
||||
"""
|
||||
# Patch extract_text to return long text (simulating detectable text layer)
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text() == _LONG_TEXT
|
||||
|
||||
def test_auto_text_with_archive_calls_ocrmypdf_skip_text(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=True
|
||||
- PDF with text > VALID_TEXT_LENGTH
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr IS called with skip_text=True
|
||||
- archive_path is set
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=True,
|
||||
)
|
||||
|
||||
mock_ocr.assert_called_once()
|
||||
call_kwargs = mock_ocr.call_args.kwargs
|
||||
assert call_kwargs.get("skip_text") is True
|
||||
assert "force_ocr" not in call_kwargs
|
||||
assert "redo_ocr" not in call_kwargs
|
||||
assert tesseract_parser.archive_path is not None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# AUTO mode — PDF without text layer (or too short)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestAutoModeNoText:
|
||||
"""AUTO mode, original PDF has no detectable text (<= 50 chars)."""
|
||||
|
||||
def test_auto_no_text_with_archive_calls_ocrmypdf_no_extra_flag(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
multi_page_images_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=True
|
||||
- PDF with no text (or text <= VALID_TEXT_LENGTH)
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr IS called WITHOUT skip_text/force_ocr/redo_ocr
|
||||
- archive_path is set (since produce_archive=True)
|
||||
"""
|
||||
# Return "no text" for the original; return real text for archive
|
||||
extract_call_count = 0
|
||||
|
||||
def _extract_side(sidecar_file, pdf_file):
|
||||
nonlocal extract_call_count
|
||||
extract_call_count += 1
|
||||
if extract_call_count == 1:
|
||||
return None # original has no text
|
||||
return _LONG_TEXT # text from archive after OCR
|
||||
|
||||
mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
multi_page_images_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=True,
|
||||
)
|
||||
|
||||
mock_ocr.assert_called_once()
|
||||
call_kwargs = mock_ocr.call_args.kwargs
|
||||
assert "skip_text" not in call_kwargs
|
||||
assert "force_ocr" not in call_kwargs
|
||||
assert "redo_ocr" not in call_kwargs
|
||||
assert tesseract_parser.archive_path is not None
|
||||
|
||||
def test_auto_no_text_no_archive_calls_ocrmypdf(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
multi_page_images_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=False
|
||||
- PDF with no text
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr IS called (no early return since no text detected)
|
||||
- archive_path is NOT set (produce_archive=False)
|
||||
"""
|
||||
extract_call_count = 0
|
||||
|
||||
def _extract_side(sidecar_file, pdf_file):
|
||||
nonlocal extract_call_count
|
||||
extract_call_count += 1
|
||||
if extract_call_count == 1:
|
||||
return None
|
||||
return _LONG_TEXT
|
||||
|
||||
mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
multi_page_images_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
mock_ocr.assert_called_once()
|
||||
assert tesseract_parser.archive_path is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OFF mode — PDF
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestOffModePdf:
|
||||
"""OCR_MODE=off, document is a PDF."""
|
||||
|
||||
def test_off_no_archive_returns_pdftotext(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- OFF mode, produce_archive=False
|
||||
- PDF with text
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr is NOT called
|
||||
- archive_path is None
|
||||
- text comes from pdftotext (extract_text)
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text() == _LONG_TEXT
|
||||
|
||||
def test_off_with_archive_calls_ocrmypdf_skip_text(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- OFF mode, produce_archive=True
|
||||
- PDF document
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr IS called with skip_text=True (PDF/A conversion only)
|
||||
- archive_path is set
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=True,
|
||||
)
|
||||
|
||||
mock_ocr.assert_called_once()
|
||||
call_kwargs = mock_ocr.call_args.kwargs
|
||||
assert call_kwargs.get("skip_text") is True
|
||||
assert "force_ocr" not in call_kwargs
|
||||
assert "redo_ocr" not in call_kwargs
|
||||
assert tesseract_parser.archive_path is not None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OFF mode — image
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestOffModeImage:
|
||||
"""OCR_MODE=off, document is an image (PNG)."""
|
||||
|
||||
def test_off_image_no_archive_no_ocrmypdf(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_png_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- OFF mode, produce_archive=False
|
||||
- Image document (PNG)
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr is NOT called
|
||||
- archive_path is None
|
||||
- text is empty string (images have no text layer)
|
||||
"""
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(simple_png_file, "image/png", produce_archive=False)
|
||||
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text() == ""
|
||||
|
||||
def test_off_image_with_archive_uses_img2pdf_path(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_png_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- OFF mode, produce_archive=True
|
||||
- Image document (PNG)
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- _convert_image_to_pdfa() is called instead of ocrmypdf.ocr
|
||||
- archive_path is set to the returned path
|
||||
- text is empty string
|
||||
"""
|
||||
fake_archive = Path("/tmp/fake-archive.pdf")
|
||||
mock_convert = mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"_convert_image_to_pdfa",
|
||||
return_value=fake_archive,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(simple_png_file, "image/png", produce_archive=True)
|
||||
|
||||
mock_convert.assert_called_once_with(simple_png_file, "image/png")
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path == fake_archive
|
||||
assert tesseract_parser.get_text() == ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# produce_archive=False never sets archive_path for FORCE / REDO / AUTO modes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestProduceArchiveFalse:
|
||||
"""Verify produce_archive=False never results in an archive regardless of mode."""
|
||||
|
||||
@pytest.mark.parametrize("mode", ["force", "redo"])
|
||||
def test_produce_archive_false_force_redo_modes(
|
||||
self,
|
||||
mode: str,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
multi_page_images_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- FORCE or REDO mode, produce_archive=False
|
||||
- Any PDF
|
||||
WHEN:
|
||||
- parse() is called (ocrmypdf mocked to succeed)
|
||||
THEN:
|
||||
- archive_path is NOT set even though ocrmypdf ran
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = mode
|
||||
tesseract_parser.parse(
|
||||
multi_page_images_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text() is not None
|
||||
|
||||
def test_produce_archive_false_auto_with_text(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=False
|
||||
- PDF with text > VALID_TEXT_LENGTH
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf is skipped entirely (early return)
|
||||
- archive_path is None
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
@@ -277,20 +277,20 @@ class TestRemoteParserParse:
|
||||
def test_parse_returns_text_from_azure(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_text() == _DEFAULT_TEXT
|
||||
|
||||
def test_parse_sets_archive_path(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
archive = remote_parser.get_archive_path()
|
||||
assert archive is not None
|
||||
@@ -300,11 +300,11 @@ class TestRemoteParserParse:
|
||||
def test_parse_closes_client_on_success(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.configure(ParserContext())
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
azure_client.close.assert_called_once()
|
||||
|
||||
@@ -312,9 +312,9 @@ class TestRemoteParserParse:
|
||||
def test_parse_sets_empty_text_when_not_configured(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_text() == ""
|
||||
assert remote_parser.get_archive_path() is None
|
||||
@@ -328,10 +328,10 @@ class TestRemoteParserParse:
|
||||
def test_get_date_always_none(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_date() is None
|
||||
|
||||
@@ -345,33 +345,33 @@ class TestRemoteParserParseError:
|
||||
def test_parse_returns_none_on_azure_error(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
failing_azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_text() is None
|
||||
|
||||
def test_parse_closes_client_on_error(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
failing_azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
failing_azure_client.close.assert_called_once()
|
||||
|
||||
def test_parse_logs_error_on_azure_failure(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
failing_azure_client: Mock,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
mock_log = mocker.patch("paperless.parsers.remote.logger")
|
||||
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
mock_log.error.assert_called_once()
|
||||
assert "Azure AI Vision parsing failed" in mock_log.error.call_args[0][0]
|
||||
@@ -386,18 +386,18 @@ class TestRemoteParserPageCount:
|
||||
def test_page_count_for_pdf(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
count = remote_parser.get_page_count(sample_pdf_file, "application/pdf")
|
||||
count = remote_parser.get_page_count(simple_digital_pdf_file, "application/pdf")
|
||||
assert isinstance(count, int)
|
||||
assert count >= 1
|
||||
|
||||
def test_page_count_returns_none_for_image_mime(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
count = remote_parser.get_page_count(sample_pdf_file, "image/png")
|
||||
count = remote_parser.get_page_count(simple_digital_pdf_file, "image/png")
|
||||
assert count is None
|
||||
|
||||
def test_page_count_returns_none_for_invalid_pdf(
|
||||
@@ -420,25 +420,31 @@ class TestRemoteParserMetadata:
|
||||
def test_extract_metadata_non_pdf_returns_empty(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
result = remote_parser.extract_metadata(sample_pdf_file, "image/png")
|
||||
result = remote_parser.extract_metadata(simple_digital_pdf_file, "image/png")
|
||||
assert result == []
|
||||
|
||||
def test_extract_metadata_pdf_returns_list(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
|
||||
result = remote_parser.extract_metadata(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
)
|
||||
assert isinstance(result, list)
|
||||
|
||||
def test_extract_metadata_pdf_entries_have_required_keys(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
|
||||
result = remote_parser.extract_metadata(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
)
|
||||
for entry in result:
|
||||
assert "namespace" in entry
|
||||
assert "prefix" in entry
|
||||
|
||||
@@ -89,15 +89,35 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
WHEN:
|
||||
- OCR parameters are constructed
|
||||
THEN:
|
||||
- Configuration from database is utilized
|
||||
- Configuration from database is utilized (AUTO mode with skip_text=True
|
||||
triggers skip_text; AUTO mode alone does not add any extra flag)
|
||||
"""
|
||||
# AUTO mode with skip_text=True explicitly passed: skip_text is set
|
||||
with override_settings(OCR_MODE="redo"):
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.mode = ModeChoices.SKIP
|
||||
instance.mode = ModeChoices.AUTO
|
||||
instance.save()
|
||||
|
||||
params = RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
|
||||
input_file="input.pdf",
|
||||
output_file="output.pdf",
|
||||
sidecar_file="sidecar.txt",
|
||||
mime_type="application/pdf",
|
||||
safe_fallback=False,
|
||||
skip_text=True,
|
||||
)
|
||||
self.assertTrue(params["skip_text"])
|
||||
self.assertNotIn("redo_ocr", params)
|
||||
self.assertNotIn("force_ocr", params)
|
||||
|
||||
# AUTO mode alone (no skip_text): no extra OCR flag is set
|
||||
with override_settings(OCR_MODE="redo"):
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.mode = ModeChoices.AUTO
|
||||
instance.save()
|
||||
|
||||
params = self.get_params()
|
||||
self.assertTrue(params["skip_text"])
|
||||
self.assertNotIn("skip_text", params)
|
||||
self.assertNotIn("redo_ocr", params)
|
||||
self.assertNotIn("force_ocr", params)
|
||||
|
||||
|
||||
@@ -370,15 +370,26 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Multi-page digital PDF with sufficient text layer
|
||||
- Default settings (mode=auto, produce_archive=True)
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Archive is created (AUTO mode + text present + produce_archive=True
|
||||
→ PDF/A conversion via skip_text)
|
||||
- Text is extracted
|
||||
"""
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "simple-digital.pdf",
|
||||
tesseract_samples_dir / "multi-page-digital.pdf",
|
||||
"application/pdf",
|
||||
)
|
||||
assert tesseract_parser.archive_path is not None
|
||||
assert tesseract_parser.archive_path.is_file()
|
||||
assert_ordered_substrings(
|
||||
tesseract_parser.get_text(),
|
||||
["This is a test document."],
|
||||
tesseract_parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
def test_with_form_default(
|
||||
@@ -397,7 +408,7 @@ class TestParsePdf:
|
||||
["Please enter your name in here:", "This is a PDF document with a form."],
|
||||
)
|
||||
|
||||
def test_with_form_redo_produces_no_archive(
|
||||
def test_with_form_redo_no_archive_when_not_requested(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
@@ -406,6 +417,7 @@ class TestParsePdf:
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "with-form.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -433,7 +445,7 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(tesseract_samples_dir / "signed.pdf", "application/pdf")
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -449,7 +461,7 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "encrypted.pdf",
|
||||
"application/pdf",
|
||||
@@ -559,7 +571,7 @@ class TestParseMultiPage:
|
||||
@pytest.mark.parametrize(
|
||||
"mode",
|
||||
[
|
||||
pytest.param("skip", id="skip"),
|
||||
pytest.param("auto", id="auto"),
|
||||
pytest.param("redo", id="redo"),
|
||||
pytest.param("force", id="force"),
|
||||
],
|
||||
@@ -587,7 +599,7 @@ class TestParseMultiPage:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-images.pdf",
|
||||
"application/pdf",
|
||||
@@ -735,16 +747,18 @@ class TestSkipArchive:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with existing text layer
|
||||
- Mode: skip_noarchive
|
||||
- Mode: auto, produce_archive=False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text extracted; no archive created
|
||||
- Text extracted from original; no archive created (text exists +
|
||||
produce_archive=False skips OCRmyPDF entirely)
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-digital.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -760,13 +774,13 @@ class TestSkipArchive:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with image-only pages (no text layer)
|
||||
- Mode: skip_noarchive
|
||||
- Mode: auto, skip_archive_file: auto
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text extracted; archive created (OCR needed)
|
||||
- Text extracted; archive created (OCR needed, no existing text)
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-images.pdf",
|
||||
"application/pdf",
|
||||
@@ -778,41 +792,58 @@ class TestSkipArchive:
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("skip_archive_file", "filename", "expect_archive"),
|
||||
("produce_archive", "filename", "expect_archive"),
|
||||
[
|
||||
pytest.param("never", "multi-page-digital.pdf", True, id="never-with-text"),
|
||||
pytest.param("never", "multi-page-images.pdf", True, id="never-no-text"),
|
||||
pytest.param(
|
||||
"with_text",
|
||||
True,
|
||||
"multi-page-digital.pdf",
|
||||
False,
|
||||
id="with-text-layer",
|
||||
True,
|
||||
id="produce-archive-with-text",
|
||||
),
|
||||
pytest.param(
|
||||
"with_text",
|
||||
True,
|
||||
"multi-page-images.pdf",
|
||||
True,
|
||||
id="with-text-no-layer",
|
||||
id="produce-archive-no-text",
|
||||
),
|
||||
pytest.param(
|
||||
"always",
|
||||
False,
|
||||
"multi-page-digital.pdf",
|
||||
False,
|
||||
id="always-with-text",
|
||||
id="no-archive-with-text-layer",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
"multi-page-images.pdf",
|
||||
False,
|
||||
id="no-archive-no-text-layer",
|
||||
),
|
||||
pytest.param("always", "multi-page-images.pdf", False, id="always-no-text"),
|
||||
],
|
||||
)
|
||||
def test_skip_archive_file_setting(
|
||||
def test_produce_archive_flag(
|
||||
self,
|
||||
skip_archive_file: str,
|
||||
produce_archive: bool, # noqa: FBT001
|
||||
filename: str,
|
||||
expect_archive: str,
|
||||
expect_archive: bool, # noqa: FBT001
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.skip_archive_file = skip_archive_file
|
||||
tesseract_parser.parse(tesseract_samples_dir / filename, "application/pdf")
|
||||
"""
|
||||
GIVEN:
|
||||
- Various PDFs (with and without text layers)
|
||||
- produce_archive flag set to True or False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- archive_path is set if and only if produce_archive=True
|
||||
- Text is always extracted
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / filename,
|
||||
"application/pdf",
|
||||
produce_archive=produce_archive,
|
||||
)
|
||||
text = tesseract_parser.get_text().lower()
|
||||
assert_ordered_substrings(text, ["page 1", "page 2", "page 3"])
|
||||
if expect_archive:
|
||||
@@ -820,6 +851,59 @@ class TestSkipArchive:
|
||||
else:
|
||||
assert tesseract_parser.archive_path is None
|
||||
|
||||
def test_tagged_pdf_skips_ocr_in_auto_mode(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true)
|
||||
- Mode: auto, produce_archive=False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- OCRmyPDF is not invoked (tagged ⇒ original_has_text=True)
|
||||
- Text is extracted from the original via pdftotext
|
||||
- No archive is produced
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "simple-digital.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text()
|
||||
|
||||
def test_tagged_pdf_produces_pdfa_archive_without_ocr(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true)
|
||||
- Mode: auto, produce_archive=True
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- OCRmyPDF runs with skip_text (PDF/A conversion only, no OCR)
|
||||
- Archive is produced
|
||||
- Text is preserved from the original
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "simple-digital.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=True,
|
||||
)
|
||||
assert tesseract_parser.archive_path is not None
|
||||
assert tesseract_parser.get_text()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parse — mixed pages / sidecar
|
||||
@@ -835,13 +919,13 @@ class TestParseMixed:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text in some pages (image) and some pages (digital)
|
||||
- Mode: skip
|
||||
- Mode: auto (skip_text), skip_archive_file: always
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- All pages extracted; archive created; sidecar notes skipped pages
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-mixed.pdf",
|
||||
"application/pdf",
|
||||
@@ -898,17 +982,18 @@ class TestParseMixed:
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with mixed pages
|
||||
- Mode: skip_noarchive
|
||||
- File with mixed pages (some with text, some image-only)
|
||||
- Mode: auto, produce_archive=False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- No archive created (file has text layer); later-page text present
|
||||
- No archive created (produce_archive=False); text from text layer present
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-mixed.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -923,12 +1008,12 @@ class TestParseMixed:
|
||||
|
||||
|
||||
class TestParseRotate:
|
||||
def test_rotate_skip_mode(
|
||||
def test_rotate_auto_mode(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.settings.rotate = True
|
||||
tesseract_parser.parse(tesseract_samples_dir / "rotated.pdf", "application/pdf")
|
||||
assert_ordered_substrings(
|
||||
@@ -955,12 +1040,19 @@ class TestParseRtl:
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- PDF with RTL Arabic text
|
||||
- PDF with RTL Arabic text in its text layer (short: 18 chars)
|
||||
- mode=off, produce_archive=True: PDF/A conversion via skip_text, no OCR engine
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Arabic content is extracted (normalised for bidi)
|
||||
- Arabic content is extracted from the PDF text layer (normalised for bidi)
|
||||
|
||||
Note: The RTL PDF has a short text layer (< VALID_TEXT_LENGTH=50) so AUTO mode
|
||||
would attempt full OCR, which fails due to PriorOcrFoundError and falls back to
|
||||
force-ocr with English Tesseract (producing garbage). Using mode="off" forces
|
||||
skip_text=True so the Arabic text layer is preserved through PDF/A conversion.
|
||||
"""
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "rtl-test.pdf",
|
||||
"application/pdf",
|
||||
@@ -1023,11 +1115,11 @@ class TestOcrmypdfParameters:
|
||||
assert ("clean" in params) == expected_clean
|
||||
assert ("clean_final" in params) == expected_clean_final
|
||||
|
||||
def test_clean_final_skip_mode(
|
||||
def test_clean_final_auto_mode(
|
||||
self,
|
||||
make_tesseract_parser: MakeTesseractParser,
|
||||
) -> None:
|
||||
with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="skip") as parser:
|
||||
with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="auto") as parser:
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
assert params["clean_final"] is True
|
||||
assert "clean" not in params
|
||||
@@ -1044,9 +1136,9 @@ class TestOcrmypdfParameters:
|
||||
@pytest.mark.parametrize(
|
||||
("ocr_mode", "ocr_deskew", "expect_deskew"),
|
||||
[
|
||||
pytest.param("skip", True, True, id="skip-deskew-on"),
|
||||
pytest.param("auto", True, True, id="auto-deskew-on"),
|
||||
pytest.param("redo", True, False, id="redo-deskew-off"),
|
||||
pytest.param("skip", False, False, id="skip-no-deskew"),
|
||||
pytest.param("auto", False, False, id="auto-no-deskew"),
|
||||
],
|
||||
)
|
||||
def test_deskew_option(
|
||||
|
||||
@@ -77,10 +77,10 @@ class TestTikaParserRegistryInterface:
|
||||
def test_get_page_count_returns_int_with_pdf_archive(
|
||||
self,
|
||||
tika_parser: TikaDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
tika_parser._archive_path = sample_pdf_file
|
||||
count = tika_parser.get_page_count(sample_pdf_file, "application/pdf")
|
||||
tika_parser._archive_path = simple_digital_pdf_file
|
||||
count = tika_parser.get_page_count(simple_digital_pdf_file, "application/pdf")
|
||||
assert isinstance(count, int)
|
||||
assert count > 0
|
||||
|
||||
|
||||
Binary file not shown.
@@ -5,6 +5,7 @@ from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from django.core.checks import ERROR
|
||||
from django.core.checks import Error
|
||||
from django.core.checks import Warning
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
@@ -12,7 +13,9 @@ from pytest_mock import MockerFixture
|
||||
|
||||
from paperless.checks import audit_log_check
|
||||
from paperless.checks import binaries_check
|
||||
from paperless.checks import check_default_language_available
|
||||
from paperless.checks import check_deprecated_db_settings
|
||||
from paperless.checks import check_remote_parser_configured
|
||||
from paperless.checks import check_v3_minimum_upgrade_version
|
||||
from paperless.checks import debug_mode_check
|
||||
from paperless.checks import paths_check
|
||||
@@ -129,13 +132,13 @@ class TestOcrSettingsChecks:
|
||||
pytest.param(
|
||||
"OCR_MODE",
|
||||
"skip_noarchive",
|
||||
"deprecated",
|
||||
id="deprecated-mode",
|
||||
'OCR output mode "skip_noarchive"',
|
||||
id="deprecated-mode-now-invalid",
|
||||
),
|
||||
pytest.param(
|
||||
"OCR_SKIP_ARCHIVE_FILE",
|
||||
"ARCHIVE_FILE_GENERATION",
|
||||
"invalid",
|
||||
'OCR_SKIP_ARCHIVE_FILE setting "invalid"',
|
||||
'PAPERLESS_ARCHIVE_FILE_GENERATION setting "invalid"',
|
||||
id="invalid-skip-archive-file",
|
||||
),
|
||||
pytest.param(
|
||||
@@ -626,3 +629,116 @@ class TestV3MinimumUpgradeVersionCheck:
|
||||
conn.introspection.table_names.side_effect = OperationalError("DB unavailable")
|
||||
mocker.patch.dict("paperless.checks.connections", {"default": conn})
|
||||
assert check_v3_minimum_upgrade_version(None) == []
|
||||
|
||||
|
||||
class TestRemoteParserChecks:
|
||||
def test_no_engine(self, settings: SettingsWrapper) -> None:
|
||||
settings.REMOTE_OCR_ENGINE = None
|
||||
msgs = check_remote_parser_configured(None)
|
||||
|
||||
assert len(msgs) == 0
|
||||
|
||||
def test_azure_no_endpoint(self, settings: SettingsWrapper) -> None:
|
||||
|
||||
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||
settings.REMOTE_OCR_API_KEY = "somekey"
|
||||
settings.REMOTE_OCR_ENDPOINT = None
|
||||
|
||||
msgs = check_remote_parser_configured(None)
|
||||
|
||||
assert len(msgs) == 1
|
||||
|
||||
msg = msgs[0]
|
||||
|
||||
assert (
|
||||
"Azure AI remote parser requires endpoint and API key to be configured."
|
||||
in msg.msg
|
||||
)
|
||||
|
||||
|
||||
class TestTesseractChecks:
|
||||
def test_default_language(self) -> None:
|
||||
check_default_language_available(None)
|
||||
|
||||
def test_no_language(self, settings: SettingsWrapper) -> None:
|
||||
|
||||
settings.OCR_LANGUAGE = ""
|
||||
|
||||
msgs = check_default_language_available(None)
|
||||
|
||||
assert len(msgs) == 1
|
||||
msg = msgs[0]
|
||||
|
||||
assert (
|
||||
"No OCR language has been specified with PAPERLESS_OCR_LANGUAGE" in msg.msg
|
||||
)
|
||||
|
||||
def test_invalid_language(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
|
||||
settings.OCR_LANGUAGE = "ita"
|
||||
|
||||
tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
|
||||
tesser_lang_mock.return_value = ["deu", "eng"]
|
||||
|
||||
msgs = check_default_language_available(None)
|
||||
|
||||
assert len(msgs) == 1
|
||||
msg = msgs[0]
|
||||
|
||||
assert msg.level == ERROR
|
||||
assert "The selected ocr language ita is not installed" in msg.msg
|
||||
|
||||
def test_multi_part_language(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- An OCR language which is multi part (ie chi-sim)
|
||||
- The language is correctly formatted
|
||||
WHEN:
|
||||
- Installed packages are checked
|
||||
THEN:
|
||||
- No errors are reported
|
||||
"""
|
||||
|
||||
settings.OCR_LANGUAGE = "chi_sim"
|
||||
|
||||
tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
|
||||
tesser_lang_mock.return_value = ["chi_sim", "eng"]
|
||||
|
||||
msgs = check_default_language_available(None)
|
||||
|
||||
assert len(msgs) == 0
|
||||
|
||||
def test_multi_part_language_bad_format(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- An OCR language which is multi part (ie chi-sim)
|
||||
- The language is correctly NOT formatted
|
||||
WHEN:
|
||||
- Installed packages are checked
|
||||
THEN:
|
||||
- No errors are reported
|
||||
"""
|
||||
settings.OCR_LANGUAGE = "chi-sim"
|
||||
|
||||
tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
|
||||
tesser_lang_mock.return_value = ["chi_sim", "eng"]
|
||||
|
||||
msgs = check_default_language_available(None)
|
||||
|
||||
assert len(msgs) == 1
|
||||
msg = msgs[0]
|
||||
|
||||
assert msg.level == ERROR
|
||||
assert "The selected ocr language chi-sim is not installed" in msg.msg
|
||||
|
||||
64
src/paperless/tests/test_checks_v3.py
Normal file
64
src/paperless/tests/test_checks_v3.py
Normal file
@@ -0,0 +1,64 @@
|
||||
"""Tests for v3 system checks: deprecated v2 OCR env var warnings."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless.checks import check_deprecated_v2_ocr_env_vars
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
|
||||
class TestDeprecatedV2OcrEnvVarWarnings:
|
||||
def test_no_deprecated_vars_returns_empty(self, mocker: MockerFixture) -> None:
|
||||
"""No warnings when neither deprecated variable is set."""
|
||||
mocker.patch.dict(os.environ, {"PAPERLESS_OCR_MODE": "auto"}, clear=True)
|
||||
result = check_deprecated_v2_ocr_env_vars(None)
|
||||
assert result == []
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("env_var", "env_value", "expected_id", "expected_fragment"),
|
||||
[
|
||||
pytest.param(
|
||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE",
|
||||
"always",
|
||||
"paperless.W002",
|
||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE",
|
||||
id="skip-archive-file-warns",
|
||||
),
|
||||
pytest.param(
|
||||
"PAPERLESS_OCR_MODE",
|
||||
"skip",
|
||||
"paperless.W003",
|
||||
"skip",
|
||||
id="ocr-mode-skip-warns",
|
||||
),
|
||||
pytest.param(
|
||||
"PAPERLESS_OCR_MODE",
|
||||
"skip_noarchive",
|
||||
"paperless.W003",
|
||||
"skip_noarchive",
|
||||
id="ocr-mode-skip-noarchive-warns",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_deprecated_var_produces_one_warning(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
env_var: str,
|
||||
env_value: str,
|
||||
expected_id: str,
|
||||
expected_fragment: str,
|
||||
) -> None:
|
||||
"""Each deprecated setting in isolation produces exactly one warning."""
|
||||
mocker.patch.dict(os.environ, {env_var: env_value}, clear=True)
|
||||
result = check_deprecated_v2_ocr_env_vars(None)
|
||||
|
||||
assert len(result) == 1
|
||||
warning = result[0]
|
||||
assert warning.id == expected_id
|
||||
assert expected_fragment in warning.msg
|
||||
66
src/paperless/tests/test_ocr_config.py
Normal file
66
src/paperless/tests/test_ocr_config.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""Tests for OcrConfig archive_file_generation field behavior."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
from django.test import override_settings
|
||||
|
||||
from paperless.config import OcrConfig
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def null_app_config(mocker) -> MagicMock:
|
||||
"""Mock ApplicationConfiguration with all fields None → falls back to Django settings."""
|
||||
return mocker.MagicMock(
|
||||
output_type=None,
|
||||
pages=None,
|
||||
language=None,
|
||||
mode=None,
|
||||
archive_file_generation=None,
|
||||
image_dpi=None,
|
||||
unpaper_clean=None,
|
||||
deskew=None,
|
||||
rotate_pages=None,
|
||||
rotate_pages_threshold=None,
|
||||
max_image_pixels=None,
|
||||
color_conversion_strategy=None,
|
||||
user_args=None,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def make_ocr_config(mocker, null_app_config):
|
||||
mocker.patch(
|
||||
"paperless.config.BaseConfig._get_config_instance",
|
||||
return_value=null_app_config,
|
||||
)
|
||||
|
||||
def _make(**django_settings_overrides):
|
||||
with override_settings(**django_settings_overrides):
|
||||
return OcrConfig()
|
||||
|
||||
return _make
|
||||
|
||||
|
||||
class TestOcrConfigArchiveFileGeneration:
|
||||
def test_auto_from_settings(self, make_ocr_config) -> None:
|
||||
cfg = make_ocr_config(OCR_MODE="auto", ARCHIVE_FILE_GENERATION="auto")
|
||||
assert cfg.archive_file_generation == "auto"
|
||||
|
||||
def test_always_from_settings(self, make_ocr_config) -> None:
|
||||
cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="always")
|
||||
assert cfg.archive_file_generation == "always"
|
||||
|
||||
def test_never_from_settings(self, make_ocr_config) -> None:
|
||||
cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="never")
|
||||
assert cfg.archive_file_generation == "never"
|
||||
|
||||
def test_db_value_overrides_setting(self, make_ocr_config, null_app_config) -> None:
|
||||
null_app_config.archive_file_generation = "never"
|
||||
cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="always")
|
||||
assert cfg.archive_file_generation == "never"
|
||||
25
src/paperless/tests/test_parser_utils.py
Normal file
25
src/paperless/tests/test_parser_utils.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""Tests for paperless.parsers.utils helpers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from paperless.parsers.utils import is_tagged_pdf
|
||||
|
||||
SAMPLES = Path(__file__).parent / "samples" / "tesseract"
|
||||
|
||||
|
||||
class TestIsTaggedPdf:
|
||||
def test_tagged_pdf_returns_true(self) -> None:
|
||||
assert is_tagged_pdf(SAMPLES / "simple-digital.pdf") is True
|
||||
|
||||
def test_untagged_pdf_returns_false(self) -> None:
|
||||
assert is_tagged_pdf(SAMPLES / "multi-page-images.pdf") is False
|
||||
|
||||
def test_nonexistent_path_returns_false(self) -> None:
|
||||
assert is_tagged_pdf(Path("/nonexistent/file.pdf")) is False
|
||||
|
||||
def test_corrupt_pdf_returns_false(self, tmp_path: Path) -> None:
|
||||
bad = tmp_path / "bad.pdf"
|
||||
bad.write_bytes(b"not a pdf")
|
||||
assert is_tagged_pdf(bad) is False
|
||||
@@ -1,18 +1,8 @@
|
||||
from django.apps import AppConfig
|
||||
from django.conf import settings
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
from paperless_mail.signals import mail_consumer_declaration
|
||||
|
||||
|
||||
class PaperlessMailConfig(AppConfig):
|
||||
name = "paperless_mail"
|
||||
|
||||
verbose_name = _("Paperless mail")
|
||||
|
||||
def ready(self) -> None:
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
if settings.TIKA_ENABLED:
|
||||
document_consumer_declaration.connect(mail_consumer_declaration)
|
||||
AppConfig.ready(self)
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
def get_parser(*args, **kwargs):
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
|
||||
# MailDocumentParser accepts no constructor args in the new-style protocol.
|
||||
# Pop legacy args that arrive from the signal-based consumer path.
|
||||
# Phase 4 will replace this signal path with the ParserRegistry.
|
||||
kwargs.pop("logging_group", None)
|
||||
kwargs.pop("progress_callback", None)
|
||||
return MailDocumentParser()
|
||||
|
||||
|
||||
def mail_consumer_declaration(sender, **kwargs):
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 20,
|
||||
"mime_types": {
|
||||
"message/rfc822": ".eml",
|
||||
},
|
||||
}
|
||||
@@ -1,4 +1,3 @@
|
||||
{% autoescape off %}
|
||||
<!doctype html>
|
||||
<html>
|
||||
|
||||
@@ -13,36 +12,34 @@
|
||||
<!-- Header -->
|
||||
<div class="grid gap-x-2 bg-slate-200 p-4">
|
||||
|
||||
<div class="col-start-9 col-span-4 row-start-1 text-right">{{ date }}</div>
|
||||
<div class="col-start-9 col-span-4 row-start-1 text-right">{{ date|safe }}</div>
|
||||
|
||||
<div class="col-start-1 row-start-1 text-slate-400 text-right">{{ from_label }}</div>
|
||||
<div class="col-start-2 col-span-7 row-start-1">{{ from }}</div>
|
||||
<div class="col-start-2 col-span-7 row-start-1">{{ from|safe }}</div>
|
||||
|
||||
<div class="col-start-1 row-start-2 text-slate-400 text-right">{{ subject_label }}</div>
|
||||
<div class=" col-start-2 col-span-10 row-start-2 font-bold">{{ subject }}</div>
|
||||
<div class=" col-start-2 col-span-10 row-start-2 font-bold">{{ subject|safe }}</div>
|
||||
|
||||
<div class="col-start-1 row-start-3 text-slate-400 text-right">{{ to_label }}</div>
|
||||
<div class="col-start-2 col-span-10 row-start-3 text-sm my-0.5">{{ to }}</div>
|
||||
<div class="col-start-2 col-span-10 row-start-3 text-sm my-0.5">{{ to|safe }}</div>
|
||||
|
||||
<div class="col-start-1 row-start-4 text-slate-400 text-right">{{ cc_label }}</div>
|
||||
<div class="col-start-2 col-span-10 row-start-4 text-sm my-0.5">{{ cc }}</div>
|
||||
<div class="col-start-2 col-span-10 row-start-4 text-sm my-0.5">{{ cc|safe }}</div>
|
||||
|
||||
<div class="col-start-1 row-start-5 text-slate-400 text-right">{{ bcc_label }}</div>
|
||||
<div class="col-start-2 col-span-10 row-start-5" text-sm my-0.5>{{ bcc }}</div>
|
||||
<div class="col-start-2 col-span-10 row-start-5" text-sm my-0.5>{{ bcc|safe }}</div>
|
||||
|
||||
<div class="col-start-1 row-start-6 text-slate-400 text-right">{{ attachments_label }}</div>
|
||||
<div class="col-start-2 col-span-10 row-start-6">{{ attachments }}</div>
|
||||
<div class="col-start-2 col-span-10 row-start-6">{{ attachments|safe }}</div>
|
||||
</div>
|
||||
|
||||
<!-- Separator-->
|
||||
<div class="border-t border-solid border-b w-full h-[1px] box-content border-black mb-5 bg-slate-200"></div>
|
||||
|
||||
<!-- Content-->
|
||||
<div class="w-full break-words">{{ content }}</div>
|
||||
<div class="w-full break-words">{{ content|safe }}</div>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
||||
|
||||
{% endautoescape %}
|
||||
|
||||
@@ -191,7 +191,10 @@ class TestMailOAuth(
|
||||
).exists(),
|
||||
)
|
||||
|
||||
self.assertIn("Error getting access token: test_error", cm.output[0])
|
||||
self.assertIn(
|
||||
"Error getting access token from OAuth provider",
|
||||
cm.output[0],
|
||||
)
|
||||
|
||||
def test_oauth_callback_view_insufficient_permissions(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -138,13 +138,16 @@ class MailAccountViewSet(ModelViewSet, PassUserMixin):
|
||||
existing_account.refresh_from_db()
|
||||
account.password = existing_account.password
|
||||
else:
|
||||
logger.error(
|
||||
"Mail account connectivity test failed: Unable to refresh oauth token",
|
||||
)
|
||||
raise MailError("Unable to refresh oauth token")
|
||||
|
||||
mailbox_login(M, account)
|
||||
return Response({"success": True})
|
||||
except MailError as e:
|
||||
except MailError:
|
||||
logger.error(
|
||||
f"Mail account {account} test failed: {e}",
|
||||
"Mail account connectivity test failed",
|
||||
)
|
||||
return HttpResponseBadRequest("Unable to connect to server")
|
||||
|
||||
@@ -218,7 +221,7 @@ class OauthCallbackView(GenericAPIView):
|
||||
|
||||
if code is None:
|
||||
logger.error(
|
||||
f"Invalid oauth callback request, code: {code}, scope: {scope}",
|
||||
"Invalid oauth callback request: missing code",
|
||||
)
|
||||
return HttpResponseBadRequest("Invalid request, see logs for more detail")
|
||||
|
||||
@@ -229,7 +232,7 @@ class OauthCallbackView(GenericAPIView):
|
||||
state = request.query_params.get("state", "")
|
||||
if not oauth_manager.validate_state(state):
|
||||
logger.error(
|
||||
f"Invalid oauth callback request received state: {state}, expected: {oauth_manager.state}",
|
||||
"Invalid oauth callback request: state validation failed",
|
||||
)
|
||||
return HttpResponseBadRequest("Invalid request, see logs for more detail")
|
||||
|
||||
@@ -276,8 +279,8 @@ class OauthCallbackView(GenericAPIView):
|
||||
return HttpResponseRedirect(
|
||||
f"{oauth_manager.oauth_redirect_url}?oauth_success=1&account_id={account.pk}",
|
||||
)
|
||||
except GetAccessTokenError as e:
|
||||
logger.error(f"Error getting access token: {e}")
|
||||
except GetAccessTokenError:
|
||||
logger.error("Error getting access token from OAuth provider")
|
||||
return HttpResponseRedirect(
|
||||
f"{oauth_manager.oauth_redirect_url}?oauth_success=0",
|
||||
)
|
||||
|
||||
@@ -1,4 +0,0 @@
|
||||
# this is here so that django finds the checks.
|
||||
from paperless_remote.checks import check_remote_parser_configured
|
||||
|
||||
__all__ = ["check_remote_parser_configured"]
|
||||
@@ -1,14 +0,0 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
from paperless_remote.signals import remote_consumer_declaration
|
||||
|
||||
|
||||
class PaperlessRemoteParserConfig(AppConfig):
|
||||
name = "paperless_remote"
|
||||
|
||||
def ready(self) -> None:
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
document_consumer_declaration.connect(remote_consumer_declaration)
|
||||
|
||||
AppConfig.ready(self)
|
||||
@@ -1,17 +0,0 @@
|
||||
from django.conf import settings
|
||||
from django.core.checks import Error
|
||||
from django.core.checks import register
|
||||
|
||||
|
||||
@register()
|
||||
def check_remote_parser_configured(app_configs, **kwargs):
|
||||
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
|
||||
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
|
||||
):
|
||||
return [
|
||||
Error(
|
||||
"Azure AI remote parser requires endpoint and API key to be configured.",
|
||||
),
|
||||
]
|
||||
|
||||
return []
|
||||
@@ -1,38 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
|
||||
# The new RemoteDocumentParser does not accept the progress_callback
|
||||
# kwarg injected by the old signal-based consumer. logging_group is
|
||||
# forwarded as a positional arg.
|
||||
# Phase 4 will replace this signal path with the new ParserRegistry.
|
||||
kwargs.pop("progress_callback", None)
|
||||
return RemoteDocumentParser(*args, **kwargs)
|
||||
|
||||
|
||||
def get_supported_mime_types() -> dict[str, str]:
|
||||
from django.conf import settings
|
||||
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.remote import RemoteEngineConfig
|
||||
|
||||
config = RemoteEngineConfig(
|
||||
engine=settings.REMOTE_OCR_ENGINE,
|
||||
api_key=settings.REMOTE_OCR_API_KEY,
|
||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||
)
|
||||
if not config.engine_is_valid():
|
||||
return {}
|
||||
return RemoteDocumentParser.supported_mime_types()
|
||||
|
||||
|
||||
def remote_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 5,
|
||||
"mime_types": get_supported_mime_types(),
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user