mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-02 22:28:51 +00:00
Compare commits
28 Commits
feature-ar
...
feature-ta
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8a2ae2059c | ||
|
|
576adad9ae | ||
|
|
50de8b5721 | ||
|
|
44ef454065 | ||
|
|
b3851c8bc5 | ||
|
|
fc950a27ca | ||
|
|
068188f549 | ||
|
|
9248733ed1 | ||
|
|
c16d23fe21 | ||
|
|
134b758c84 | ||
|
|
f40442c61e | ||
|
|
5adfa49d00 | ||
|
|
c06687d070 | ||
|
|
8c737f41c0 | ||
|
|
20d43936b3 | ||
|
|
87728f3448 | ||
|
|
804b5ed99d | ||
|
|
f13f6a132c | ||
|
|
66ad082f8e | ||
|
|
357f462e82 | ||
|
|
b40721ae41 | ||
|
|
75afbd1a3c | ||
|
|
9df15416dc | ||
|
|
0ff1a7809e | ||
|
|
32dbb2438b | ||
|
|
442d049a57 | ||
|
|
e8c39e83fc | ||
|
|
e6a334878c |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -111,4 +111,3 @@ celerybeat-schedule*
|
||||
|
||||
# ignore pnpm package store folder created when setting up the devcontainer
|
||||
.pnpm-store/
|
||||
.worktrees
|
||||
|
||||
10
docs/api.md
10
docs/api.md
@@ -62,10 +62,14 @@ The REST api provides five different forms of authentication.
|
||||
|
||||
## Searching for documents
|
||||
|
||||
Full text searching is available on the `/api/documents/` endpoint. Two
|
||||
specific query parameters cause the API to return full text search
|
||||
Full text searching is available on the `/api/documents/` endpoint. The
|
||||
following query parameters cause the API to return Tantivy-backed search
|
||||
results:
|
||||
|
||||
- `/api/documents/?text=your%20search%20query`: Search title and content
|
||||
using simple substring-style search.
|
||||
- `/api/documents/?title_search=your%20search%20query`: Search title only
|
||||
using simple substring-style search.
|
||||
- `/api/documents/?query=your%20search%20query`: Search for a document
|
||||
using a full text query. For details on the syntax, see [Basic Usage - Searching](usage.md#basic-usage_searching).
|
||||
- `/api/documents/?more_like_id=1234`: Search for documents similar to
|
||||
@@ -439,3 +443,5 @@ Initial API version.
|
||||
- The `all` parameter of list endpoints is now deprecated and will be removed in a future version.
|
||||
- The bulk edit objects endpoint now supports `all` and `filters` parameters to avoid having to send
|
||||
large lists of object IDs for operations affecting many objects.
|
||||
- The legacy `title_content` document search parameter is deprecated and will be removed in a future version.
|
||||
Clients should use `text` for simple title-and-content search and `title_search` for title-only search.
|
||||
|
||||
@@ -801,14 +801,11 @@ parsing documents.
|
||||
|
||||
#### [`PAPERLESS_OCR_MODE=<mode>`](#PAPERLESS_OCR_MODE) {#PAPERLESS_OCR_MODE}
|
||||
|
||||
: Tell paperless when and how to perform ocr on your documents. Four
|
||||
: Tell paperless when and how to perform ocr on your documents. Three
|
||||
modes are available:
|
||||
|
||||
- `auto` (default): Paperless detects whether a document already
|
||||
has embedded text via pdftotext. If sufficient text is found,
|
||||
OCR is skipped for that document (`--skip-text`). If no text is
|
||||
present, OCR runs normally. This is the safest option for mixed
|
||||
document collections.
|
||||
- `skip`: Paperless skips all pages and will perform ocr only on
|
||||
pages where no text is present. This is the safest option.
|
||||
|
||||
- `redo`: Paperless will OCR all pages of your documents and
|
||||
attempt to replace any existing text layers with new text. This
|
||||
@@ -826,59 +823,24 @@ modes are available:
|
||||
significantly larger and text won't appear as sharp when zoomed
|
||||
in.
|
||||
|
||||
- `off`: Paperless never invokes the OCR engine. For PDFs, text
|
||||
is extracted via pdftotext only. For image documents, text will
|
||||
be empty. Archive file generation still works via format
|
||||
conversion (no Tesseract or Ghostscript required).
|
||||
The default is `skip`, which only performs OCR when necessary and
|
||||
always creates archived documents.
|
||||
|
||||
The default is `auto`.
|
||||
|
||||
For the `skip`, `redo`, and `force` modes, read more about OCR
|
||||
behaviour in the [OCRmyPDF
|
||||
Read more about this in the [OCRmyPDF
|
||||
documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
|
||||
|
||||
#### [`PAPERLESS_ARCHIVE_FILE_GENERATION=<mode>`](#PAPERLESS_ARCHIVE_FILE_GENERATION) {#PAPERLESS_ARCHIVE_FILE_GENERATION}
|
||||
#### [`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=<mode>`](#PAPERLESS_OCR_SKIP_ARCHIVE_FILE) {#PAPERLESS_OCR_SKIP_ARCHIVE_FILE}
|
||||
|
||||
: Controls when paperless creates a PDF/A archive version of your
|
||||
documents. Archive files are stored alongside the original and are used
|
||||
for display in the web interface.
|
||||
: Specify when you would like paperless to skip creating an archived
|
||||
version of your documents. This is useful if you don't want to have two
|
||||
almost-identical versions of your documents in the media folder.
|
||||
|
||||
- `auto` (default): Produce archives for scanned or image-based
|
||||
documents. Skip archive generation for born-digital PDFs that
|
||||
already contain embedded text. This is the recommended setting
|
||||
for mixed document collections.
|
||||
- `always`: Always produce a PDF/A archive when the parser
|
||||
supports it, regardless of whether the document already has
|
||||
text.
|
||||
- `never`: Never produce an archive. Only the original file is
|
||||
stored. Saves disk space but the web viewer will display the
|
||||
original file directly.
|
||||
- `never`: Never skip creating an archived version.
|
||||
- `with_text`: Skip creating an archived version for documents
|
||||
that already have embedded text.
|
||||
- `always`: Always skip creating an archived version.
|
||||
|
||||
**Behaviour by file type and mode** (`auto` column shows the default):
|
||||
|
||||
| Document type | `never` | `auto` (default) | `always` |
|
||||
| -------------------------- | ------- | -------------------------- | -------- |
|
||||
| Scanned image (TIFF, JPEG) | No | **Yes** | Yes |
|
||||
| Image-based PDF | No | **Yes** (short/no text, untagged) | Yes |
|
||||
| Born-digital PDF | No | No (tagged or has embedded text) | Yes |
|
||||
| Plain text, email, HTML | No | No | No |
|
||||
| DOCX / ODT (via Tika) | Yes\* | Yes\* | Yes\* |
|
||||
|
||||
\* Tika always produces a PDF rendition for display; this counts as
|
||||
the archive regardless of the setting.
|
||||
|
||||
!!! note
|
||||
|
||||
This setting applies to the built-in Tesseract parser. Parsers
|
||||
that must always convert documents to PDF for display (e.g. DOCX,
|
||||
ODT via Tika) will produce a PDF regardless of this setting.
|
||||
|
||||
!!! note
|
||||
|
||||
The **remote OCR parser** (Azure AI) always produces a searchable
|
||||
PDF and stores it as the archive copy, regardless of this setting.
|
||||
`ARCHIVE_FILE_GENERATION=never` has no effect when the remote
|
||||
parser handles a document.
|
||||
The default is `never`.
|
||||
|
||||
#### [`PAPERLESS_OCR_CLEAN=<mode>`](#PAPERLESS_OCR_CLEAN) {#PAPERLESS_OCR_CLEAN}
|
||||
|
||||
|
||||
@@ -104,64 +104,7 @@ Multiple options are combined in a single value:
|
||||
PAPERLESS_DB_OPTIONS="sslmode=require;sslrootcert=/certs/ca.pem;pool.max_size=10"
|
||||
```
|
||||
|
||||
## OCR and Archive File Generation Settings
|
||||
|
||||
The settings that control OCR behaviour and archive file generation have been redesigned. The old settings that coupled these two concerns together are **removed** — old values are not silently honoured; a startup warning is logged if any removed variable is still set in your environment.
|
||||
|
||||
### Removed settings
|
||||
|
||||
| Removed Setting | Replacement |
|
||||
| ------------------------------------------- | --------------------------------------------------------------------- |
|
||||
| `PAPERLESS_OCR_MODE=skip` | `PAPERLESS_OCR_MODE=auto` (new default) |
|
||||
| `PAPERLESS_OCR_MODE=skip_noarchive` | `PAPERLESS_OCR_MODE=auto` + `PAPERLESS_ARCHIVE_FILE_GENERATION=never` |
|
||||
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never` | `PAPERLESS_ARCHIVE_FILE_GENERATION=always` |
|
||||
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text` | `PAPERLESS_ARCHIVE_FILE_GENERATION=auto` (new default) |
|
||||
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always` | `PAPERLESS_ARCHIVE_FILE_GENERATION=never` |
|
||||
|
||||
### What changed and why
|
||||
|
||||
Previously, `OCR_MODE` conflated two independent concerns: whether to run OCR and whether to produce an archive. `skip` meant "skip OCR if text exists, but always produce an archive". `skip_noarchive` meant "skip OCR if text exists, and also skip the archive". This made it impossible to, for example, disable OCR entirely while still producing archives.
|
||||
|
||||
The new settings are independent:
|
||||
|
||||
- [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) controls OCR: `auto` (default), `force`, `redo`, `off`.
|
||||
- [`PAPERLESS_ARCHIVE_FILE_GENERATION`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) controls archive production: `auto` (default), `always`, `never`.
|
||||
|
||||
### Action Required
|
||||
|
||||
Remove any `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` variable from your environment. If you relied on `OCR_MODE=skip` or `OCR_MODE=skip_noarchive`, update accordingly:
|
||||
|
||||
```bash
|
||||
# v2: skip OCR when text present, always archive
|
||||
PAPERLESS_OCR_MODE=skip
|
||||
# v3: equivalent (auto is the new default)
|
||||
# No change needed — auto is the default
|
||||
|
||||
# v2: skip OCR when text present, skip archive too
|
||||
PAPERLESS_OCR_MODE=skip_noarchive
|
||||
# v3: equivalent
|
||||
PAPERLESS_OCR_MODE=auto
|
||||
PAPERLESS_ARCHIVE_FILE_GENERATION=never
|
||||
|
||||
# v2: always skip archive
|
||||
PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always
|
||||
# v3: equivalent
|
||||
PAPERLESS_ARCHIVE_FILE_GENERATION=never
|
||||
|
||||
# v2: skip archive only for born-digital docs
|
||||
PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text
|
||||
# v3: equivalent (auto is the new default)
|
||||
PAPERLESS_ARCHIVE_FILE_GENERATION=auto
|
||||
```
|
||||
|
||||
### Remote OCR parser
|
||||
|
||||
If you use the **remote OCR parser** (Azure AI), note that it always produces a
|
||||
searchable PDF and stores it as the archive copy. `ARCHIVE_FILE_GENERATION=never`
|
||||
has no effect for documents handled by the remote parser — the archive is produced
|
||||
unconditionally by the remote engine.
|
||||
|
||||
# Search Index (Whoosh -> Tantivy)
|
||||
## Search Index (Whoosh -> Tantivy)
|
||||
|
||||
The full-text search backend has been replaced with [Tantivy](https://github.com/quickwit-oss/tantivy).
|
||||
The index format is incompatible with Whoosh, so **the search index is automatically rebuilt from
|
||||
|
||||
@@ -633,11 +633,12 @@ hardware, but a few settings can improve performance:
|
||||
consumption, so you might want to lower these settings (example: 2
|
||||
workers and 1 thread to always have some computing power left for
|
||||
other tasks).
|
||||
- Keep [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) at its default value `auto` and consider
|
||||
- Keep [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) at its default value `skip` and consider
|
||||
OCRing your documents before feeding them into Paperless. Some
|
||||
scanners are able to do this!
|
||||
- Set [`PAPERLESS_ARCHIVE_FILE_GENERATION`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) to `never` to skip archive
|
||||
file generation entirely, saving disk space at the cost of in-browser PDF/A viewing.
|
||||
- Set [`PAPERLESS_OCR_SKIP_ARCHIVE_FILE`](configuration.md#PAPERLESS_OCR_SKIP_ARCHIVE_FILE) to `with_text` to skip archive
|
||||
file generation for already OCRed documents, or `always` to skip it
|
||||
for all documents.
|
||||
- If you want to perform OCR on the device, consider using
|
||||
`PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use
|
||||
less memory at the expense of slightly worse OCR results.
|
||||
|
||||
@@ -134,9 +134,9 @@ following operations on your documents:
|
||||
!!! tip
|
||||
|
||||
This process can be configured to fit your needs. If you don't want
|
||||
paperless to create archived versions for born-digital documents, set
|
||||
[`PAPERLESS_ARCHIVE_FILE_GENERATION=auto`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION)
|
||||
(the default). To skip archives entirely, use `never`. Please read the
|
||||
paperless to create archived versions for digital documents, you can
|
||||
configure that by configuring
|
||||
`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text`. Please read the
|
||||
[relevant section in the documentation](configuration.md#ocr).
|
||||
|
||||
!!! note
|
||||
|
||||
@@ -49,11 +49,11 @@ test('text filtering', async ({ page }) => {
|
||||
await page.getByRole('main').getByRole('combobox').click()
|
||||
await page.getByRole('main').getByRole('combobox').fill('test')
|
||||
await expect(page.locator('pngx-document-list')).toHaveText(/32 documents/)
|
||||
await expect(page).toHaveURL(/title_content=test/)
|
||||
await expect(page).toHaveURL(/text=test/)
|
||||
await page.getByRole('button', { name: 'Title & content' }).click()
|
||||
await page.getByRole('button', { name: 'Title', exact: true }).click()
|
||||
await expect(page.locator('pngx-document-list')).toHaveText(/9 documents/)
|
||||
await expect(page).toHaveURL(/title__icontains=test/)
|
||||
await expect(page).toHaveURL(/title_search=test/)
|
||||
await page.getByRole('button', { name: 'Title', exact: true }).click()
|
||||
await page.getByRole('button', { name: 'Advanced search' }).click()
|
||||
await expect(page).toHaveURL(/query=test/)
|
||||
|
||||
@@ -3545,7 +3545,7 @@
|
||||
"time": 1.091,
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&title_content=test",
|
||||
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&text=test",
|
||||
"httpVersion": "HTTP/1.1",
|
||||
"cookies": [],
|
||||
"headers": [
|
||||
@@ -3579,7 +3579,7 @@
|
||||
"value": "true"
|
||||
},
|
||||
{
|
||||
"name": "title_content",
|
||||
"name": "text",
|
||||
"value": "test"
|
||||
}
|
||||
],
|
||||
@@ -4303,7 +4303,7 @@
|
||||
"time": 0.603,
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&title__icontains=test",
|
||||
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&title_search=test",
|
||||
"httpVersion": "HTTP/1.1",
|
||||
"cookies": [],
|
||||
"headers": [
|
||||
@@ -4337,7 +4337,7 @@
|
||||
"value": "true"
|
||||
},
|
||||
{
|
||||
"name": "title__icontains",
|
||||
"name": "title_search",
|
||||
"value": "test"
|
||||
}
|
||||
],
|
||||
|
||||
@@ -24,7 +24,7 @@ import {
|
||||
FILTER_HAS_DOCUMENT_TYPE_ANY,
|
||||
FILTER_HAS_STORAGE_PATH_ANY,
|
||||
FILTER_HAS_TAGS_ALL,
|
||||
FILTER_TITLE_CONTENT,
|
||||
FILTER_SIMPLE_TEXT,
|
||||
} from 'src/app/data/filter-rule-type'
|
||||
import { GlobalSearchType, SETTINGS_KEYS } from 'src/app/data/ui-settings'
|
||||
import { DocumentListViewService } from 'src/app/services/document-list-view.service'
|
||||
@@ -545,7 +545,7 @@ describe('GlobalSearchComponent', () => {
|
||||
component.query = 'test'
|
||||
component.runFullSearch()
|
||||
expect(qfSpy).toHaveBeenCalledWith([
|
||||
{ rule_type: FILTER_TITLE_CONTENT, value: 'test' },
|
||||
{ rule_type: FILTER_SIMPLE_TEXT, value: 'test' },
|
||||
])
|
||||
|
||||
settingsService.set(
|
||||
|
||||
@@ -25,7 +25,7 @@ import {
|
||||
FILTER_HAS_DOCUMENT_TYPE_ANY,
|
||||
FILTER_HAS_STORAGE_PATH_ANY,
|
||||
FILTER_HAS_TAGS_ALL,
|
||||
FILTER_TITLE_CONTENT,
|
||||
FILTER_SIMPLE_TEXT,
|
||||
} from 'src/app/data/filter-rule-type'
|
||||
import { ObjectWithId } from 'src/app/data/object-with-id'
|
||||
import { GlobalSearchType, SETTINGS_KEYS } from 'src/app/data/ui-settings'
|
||||
@@ -410,7 +410,7 @@ export class GlobalSearchComponent implements OnInit {
|
||||
public runFullSearch() {
|
||||
const ruleType = this.useAdvancedForFullSearch
|
||||
? FILTER_FULLTEXT_QUERY
|
||||
: FILTER_TITLE_CONTENT
|
||||
: FILTER_SIMPLE_TEXT
|
||||
this.documentService.searchQuery = this.useAdvancedForFullSearch
|
||||
? this.query
|
||||
: ''
|
||||
|
||||
@@ -4,7 +4,7 @@ import { ComponentFixture, TestBed } from '@angular/core/testing'
|
||||
import { By } from '@angular/platform-browser'
|
||||
import { NgbAccordionButton, NgbActiveModal } from '@ng-bootstrap/ng-bootstrap'
|
||||
import { of, throwError } from 'rxjs'
|
||||
import { FILTER_TITLE } from 'src/app/data/filter-rule-type'
|
||||
import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type'
|
||||
import { DocumentService } from 'src/app/services/rest/document.service'
|
||||
import { StoragePathService } from 'src/app/services/rest/storage-path.service'
|
||||
import { SettingsService } from 'src/app/services/settings.service'
|
||||
@@ -105,7 +105,7 @@ describe('StoragePathEditDialogComponent', () => {
|
||||
null,
|
||||
'created',
|
||||
true,
|
||||
[{ rule_type: FILTER_TITLE, value: 'bar' }],
|
||||
[{ rule_type: FILTER_SIMPLE_TITLE, value: 'bar' }],
|
||||
{ truncate_content: true }
|
||||
)
|
||||
listSpy.mockReturnValueOnce(
|
||||
|
||||
@@ -23,7 +23,7 @@ import {
|
||||
} from 'rxjs'
|
||||
import { EditDialogComponent } from 'src/app/components/common/edit-dialog/edit-dialog.component'
|
||||
import { Document } from 'src/app/data/document'
|
||||
import { FILTER_TITLE } from 'src/app/data/filter-rule-type'
|
||||
import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type'
|
||||
import { DEFAULT_MATCHING_ALGORITHM } from 'src/app/data/matching-model'
|
||||
import { StoragePath } from 'src/app/data/storage-path'
|
||||
import { IfOwnerDirective } from 'src/app/directives/if-owner.directive'
|
||||
@@ -146,7 +146,7 @@ export class StoragePathEditDialogComponent
|
||||
null,
|
||||
'created',
|
||||
true,
|
||||
[{ rule_type: FILTER_TITLE, value: title }],
|
||||
[{ rule_type: FILTER_SIMPLE_TITLE, value: title }],
|
||||
{ truncate_content: true }
|
||||
)
|
||||
.pipe(
|
||||
|
||||
@@ -3,7 +3,7 @@ import { provideHttpClientTesting } from '@angular/common/http/testing'
|
||||
import { ComponentFixture, TestBed } from '@angular/core/testing'
|
||||
import { NG_VALUE_ACCESSOR } from '@angular/forms'
|
||||
import { of, throwError } from 'rxjs'
|
||||
import { FILTER_TITLE } from 'src/app/data/filter-rule-type'
|
||||
import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type'
|
||||
import { DocumentService } from 'src/app/services/rest/document.service'
|
||||
import { DocumentLinkComponent } from './document-link.component'
|
||||
|
||||
@@ -99,7 +99,7 @@ describe('DocumentLinkComponent', () => {
|
||||
null,
|
||||
'created',
|
||||
true,
|
||||
[{ rule_type: FILTER_TITLE, value: 'bar' }],
|
||||
[{ rule_type: FILTER_SIMPLE_TITLE, value: 'bar' }],
|
||||
{ truncate_content: true }
|
||||
)
|
||||
listSpy.mockReturnValueOnce(throwError(() => new Error()))
|
||||
|
||||
@@ -28,7 +28,7 @@ import {
|
||||
tap,
|
||||
} from 'rxjs'
|
||||
import { Document } from 'src/app/data/document'
|
||||
import { FILTER_TITLE } from 'src/app/data/filter-rule-type'
|
||||
import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type'
|
||||
import { CustomDatePipe } from 'src/app/pipes/custom-date.pipe'
|
||||
import { DocumentService } from 'src/app/services/rest/document.service'
|
||||
import { AbstractInputComponent } from '../abstract-input'
|
||||
@@ -121,7 +121,7 @@ export class DocumentLinkComponent
|
||||
null,
|
||||
'created',
|
||||
true,
|
||||
[{ rule_type: FILTER_TITLE, value: title }],
|
||||
[{ rule_type: FILTER_SIMPLE_TITLE, value: title }],
|
||||
{ truncate_content: true }
|
||||
)
|
||||
.pipe(
|
||||
|
||||
@@ -428,7 +428,7 @@ describe('BulkEditorComponent', () => {
|
||||
req.flush(true)
|
||||
expect(req.request.body).toEqual({
|
||||
all: true,
|
||||
filters: { title__icontains: 'apple' },
|
||||
filters: { title_search: 'apple' },
|
||||
method: 'modify_tags',
|
||||
parameters: { add_tags: [101], remove_tags: [] },
|
||||
})
|
||||
|
||||
@@ -67,6 +67,8 @@ import {
|
||||
FILTER_OWNER_DOES_NOT_INCLUDE,
|
||||
FILTER_OWNER_ISNULL,
|
||||
FILTER_SHARED_BY_USER,
|
||||
FILTER_SIMPLE_TEXT,
|
||||
FILTER_SIMPLE_TITLE,
|
||||
FILTER_STORAGE_PATH,
|
||||
FILTER_TITLE,
|
||||
FILTER_TITLE_CONTENT,
|
||||
@@ -312,7 +314,7 @@ describe('FilterEditorComponent', () => {
|
||||
expect(component.textFilter).toEqual(null)
|
||||
component.filterRules = [
|
||||
{
|
||||
rule_type: FILTER_TITLE_CONTENT,
|
||||
rule_type: FILTER_SIMPLE_TEXT,
|
||||
value: 'foo',
|
||||
},
|
||||
]
|
||||
@@ -320,6 +322,18 @@ describe('FilterEditorComponent', () => {
|
||||
expect(component.textFilterTarget).toEqual('title-content') // TEXT_FILTER_TARGET_TITLE_CONTENT
|
||||
}))
|
||||
|
||||
it('should ingest legacy text filter rules for doc title + content', fakeAsync(() => {
|
||||
expect(component.textFilter).toEqual(null)
|
||||
component.filterRules = [
|
||||
{
|
||||
rule_type: FILTER_TITLE_CONTENT,
|
||||
value: 'legacy foo',
|
||||
},
|
||||
]
|
||||
expect(component.textFilter).toEqual('legacy foo')
|
||||
expect(component.textFilterTarget).toEqual('title-content') // TEXT_FILTER_TARGET_TITLE_CONTENT
|
||||
}))
|
||||
|
||||
it('should ingest text filter rules for doc asn', fakeAsync(() => {
|
||||
expect(component.textFilter).toEqual(null)
|
||||
component.filterRules = [
|
||||
@@ -1117,7 +1131,7 @@ describe('FilterEditorComponent', () => {
|
||||
expect(component.textFilter).toEqual('foo')
|
||||
expect(component.filterRules).toEqual([
|
||||
{
|
||||
rule_type: FILTER_TITLE_CONTENT,
|
||||
rule_type: FILTER_SIMPLE_TEXT,
|
||||
value: 'foo',
|
||||
},
|
||||
])
|
||||
@@ -1136,7 +1150,7 @@ describe('FilterEditorComponent', () => {
|
||||
expect(component.textFilterTarget).toEqual('title')
|
||||
expect(component.filterRules).toEqual([
|
||||
{
|
||||
rule_type: FILTER_TITLE,
|
||||
rule_type: FILTER_SIMPLE_TITLE,
|
||||
value: 'foo',
|
||||
},
|
||||
])
|
||||
@@ -1250,30 +1264,12 @@ describe('FilterEditorComponent', () => {
|
||||
])
|
||||
}))
|
||||
|
||||
it('should convert user input to correct filter rules on custom fields query', fakeAsync(() => {
|
||||
component.textFilterInput.nativeElement.value = 'foo'
|
||||
component.textFilterInput.nativeElement.dispatchEvent(new Event('input'))
|
||||
const textFieldTargetDropdown = fixture.debugElement.queryAll(
|
||||
By.directive(NgbDropdownItem)
|
||||
)[3]
|
||||
textFieldTargetDropdown.triggerEventHandler('click') // TEXT_FILTER_TARGET_CUSTOM_FIELDS
|
||||
fixture.detectChanges()
|
||||
tick(400)
|
||||
expect(component.textFilterTarget).toEqual('custom-fields')
|
||||
expect(component.filterRules).toEqual([
|
||||
{
|
||||
rule_type: FILTER_CUSTOM_FIELDS_TEXT,
|
||||
value: 'foo',
|
||||
},
|
||||
])
|
||||
}))
|
||||
|
||||
it('should convert user input to correct filter rules on mime type', fakeAsync(() => {
|
||||
component.textFilterInput.nativeElement.value = 'pdf'
|
||||
component.textFilterInput.nativeElement.dispatchEvent(new Event('input'))
|
||||
const textFieldTargetDropdown = fixture.debugElement.queryAll(
|
||||
By.directive(NgbDropdownItem)
|
||||
)[4]
|
||||
)[3]
|
||||
textFieldTargetDropdown.triggerEventHandler('click') // TEXT_FILTER_TARGET_MIME_TYPE
|
||||
fixture.detectChanges()
|
||||
tick(400)
|
||||
@@ -1291,8 +1287,8 @@ describe('FilterEditorComponent', () => {
|
||||
component.textFilterInput.nativeElement.dispatchEvent(new Event('input'))
|
||||
const textFieldTargetDropdown = fixture.debugElement.queryAll(
|
||||
By.directive(NgbDropdownItem)
|
||||
)[5]
|
||||
textFieldTargetDropdown.triggerEventHandler('click') // TEXT_FILTER_TARGET_ASN
|
||||
)[4]
|
||||
textFieldTargetDropdown.triggerEventHandler('click') // TEXT_FILTER_TARGET_FULLTEXT_QUERY
|
||||
fixture.detectChanges()
|
||||
tick(400)
|
||||
expect(component.textFilterTarget).toEqual('fulltext-query')
|
||||
@@ -1696,12 +1692,56 @@ describe('FilterEditorComponent', () => {
|
||||
])
|
||||
}))
|
||||
|
||||
it('should convert legacy title filters into full text query when adding a created relative date', fakeAsync(() => {
|
||||
component.filterRules = [
|
||||
{
|
||||
rule_type: FILTER_TITLE,
|
||||
value: 'foo',
|
||||
},
|
||||
]
|
||||
const dateCreatedDropdown = fixture.debugElement.queryAll(
|
||||
By.directive(DatesDropdownComponent)
|
||||
)[0]
|
||||
component.dateCreatedRelativeDate = RelativeDate.WITHIN_1_WEEK
|
||||
dateCreatedDropdown.triggerEventHandler('datesSet')
|
||||
fixture.detectChanges()
|
||||
tick(400)
|
||||
expect(component.filterRules).toEqual([
|
||||
{
|
||||
rule_type: FILTER_FULLTEXT_QUERY,
|
||||
value: 'foo,created:[-1 week to now]',
|
||||
},
|
||||
])
|
||||
}))
|
||||
|
||||
it('should convert simple title filters into full text query when adding a created relative date', fakeAsync(() => {
|
||||
component.filterRules = [
|
||||
{
|
||||
rule_type: FILTER_SIMPLE_TITLE,
|
||||
value: 'foo',
|
||||
},
|
||||
]
|
||||
const dateCreatedDropdown = fixture.debugElement.queryAll(
|
||||
By.directive(DatesDropdownComponent)
|
||||
)[0]
|
||||
component.dateCreatedRelativeDate = RelativeDate.WITHIN_1_WEEK
|
||||
dateCreatedDropdown.triggerEventHandler('datesSet')
|
||||
fixture.detectChanges()
|
||||
tick(400)
|
||||
expect(component.filterRules).toEqual([
|
||||
{
|
||||
rule_type: FILTER_FULLTEXT_QUERY,
|
||||
value: 'foo,created:[-1 week to now]',
|
||||
},
|
||||
])
|
||||
}))
|
||||
|
||||
it('should leave relative dates not in quick list intact', fakeAsync(() => {
|
||||
component.textFilterInput.nativeElement.value = 'created:[-2 week to now]'
|
||||
component.textFilterInput.nativeElement.dispatchEvent(new Event('input'))
|
||||
const textFieldTargetDropdown = fixture.debugElement.queryAll(
|
||||
By.directive(NgbDropdownItem)
|
||||
)[5]
|
||||
)[4]
|
||||
textFieldTargetDropdown.triggerEventHandler('click')
|
||||
fixture.detectChanges()
|
||||
tick(400)
|
||||
@@ -2031,12 +2071,30 @@ describe('FilterEditorComponent', () => {
|
||||
|
||||
component.filterRules = [
|
||||
{
|
||||
rule_type: FILTER_TITLE,
|
||||
rule_type: FILTER_SIMPLE_TITLE,
|
||||
value: 'foo',
|
||||
},
|
||||
]
|
||||
expect(component.generateFilterName()).toEqual('Title: foo')
|
||||
|
||||
component.filterRules = [
|
||||
{
|
||||
rule_type: FILTER_TITLE_CONTENT,
|
||||
value: 'legacy foo',
|
||||
},
|
||||
]
|
||||
expect(component.generateFilterName()).toEqual(
|
||||
'Title & content: legacy foo'
|
||||
)
|
||||
|
||||
component.filterRules = [
|
||||
{
|
||||
rule_type: FILTER_SIMPLE_TEXT,
|
||||
value: 'foo',
|
||||
},
|
||||
]
|
||||
expect(component.generateFilterName()).toEqual('Title & content: foo')
|
||||
|
||||
component.filterRules = [
|
||||
{
|
||||
rule_type: FILTER_ASN,
|
||||
@@ -2156,6 +2214,36 @@ describe('FilterEditorComponent', () => {
|
||||
})
|
||||
})
|
||||
|
||||
it('should hide deprecated custom fields target from default text filter targets', () => {
|
||||
expect(component.textFilterTargets).not.toContainEqual({
|
||||
id: 'custom-fields',
|
||||
name: $localize`Custom fields (Deprecated)`,
|
||||
})
|
||||
})
|
||||
|
||||
it('should keep deprecated custom fields target available for legacy filters', fakeAsync(() => {
|
||||
component.filterRules = [
|
||||
{
|
||||
rule_type: FILTER_CUSTOM_FIELDS_TEXT,
|
||||
value: 'foo',
|
||||
},
|
||||
]
|
||||
fixture.detectChanges()
|
||||
tick()
|
||||
|
||||
expect(component.textFilterTarget).toEqual('custom-fields')
|
||||
expect(component.textFilterTargets).toContainEqual({
|
||||
id: 'custom-fields',
|
||||
name: $localize`Custom fields (Deprecated)`,
|
||||
})
|
||||
expect(component.filterRules).toEqual([
|
||||
{
|
||||
rule_type: FILTER_CUSTOM_FIELDS_TEXT,
|
||||
value: 'foo',
|
||||
},
|
||||
])
|
||||
}))
|
||||
|
||||
it('should call autocomplete endpoint on input', fakeAsync(() => {
|
||||
component.textFilterTarget = 'fulltext-query' // TEXT_FILTER_TARGET_FULLTEXT_QUERY
|
||||
const autocompleteSpy = jest.spyOn(searchService, 'autocomplete')
|
||||
|
||||
@@ -71,6 +71,8 @@ import {
|
||||
FILTER_OWNER_DOES_NOT_INCLUDE,
|
||||
FILTER_OWNER_ISNULL,
|
||||
FILTER_SHARED_BY_USER,
|
||||
FILTER_SIMPLE_TEXT,
|
||||
FILTER_SIMPLE_TITLE,
|
||||
FILTER_STORAGE_PATH,
|
||||
FILTER_TITLE,
|
||||
FILTER_TITLE_CONTENT,
|
||||
@@ -195,10 +197,6 @@ const DEFAULT_TEXT_FILTER_TARGET_OPTIONS = [
|
||||
name: $localize`Title & content`,
|
||||
},
|
||||
{ id: TEXT_FILTER_TARGET_ASN, name: $localize`ASN` },
|
||||
{
|
||||
id: TEXT_FILTER_TARGET_CUSTOM_FIELDS,
|
||||
name: $localize`Custom fields`,
|
||||
},
|
||||
{ id: TEXT_FILTER_TARGET_MIME_TYPE, name: $localize`File type` },
|
||||
{
|
||||
id: TEXT_FILTER_TARGET_FULLTEXT_QUERY,
|
||||
@@ -206,6 +204,12 @@ const DEFAULT_TEXT_FILTER_TARGET_OPTIONS = [
|
||||
},
|
||||
]
|
||||
|
||||
const DEPRECATED_CUSTOM_FIELDS_TEXT_FILTER_TARGET_OPTION = {
|
||||
// Kept only so legacy saved views can render and be edited away from, remove me eventually
|
||||
id: TEXT_FILTER_TARGET_CUSTOM_FIELDS,
|
||||
name: $localize`Custom fields (Deprecated)`,
|
||||
}
|
||||
|
||||
const TEXT_FILTER_TARGET_MORELIKE_OPTION = {
|
||||
id: TEXT_FILTER_TARGET_FULLTEXT_MORELIKE,
|
||||
name: $localize`More like`,
|
||||
@@ -318,8 +322,13 @@ export class FilterEditorComponent
|
||||
return $localize`Custom fields query`
|
||||
|
||||
case FILTER_TITLE:
|
||||
case FILTER_SIMPLE_TITLE:
|
||||
return $localize`Title: ${rule.value}`
|
||||
|
||||
case FILTER_TITLE_CONTENT:
|
||||
case FILTER_SIMPLE_TEXT:
|
||||
return $localize`Title & content: ${rule.value}`
|
||||
|
||||
case FILTER_ASN:
|
||||
return $localize`ASN: ${rule.value}`
|
||||
|
||||
@@ -353,12 +362,16 @@ export class FilterEditorComponent
|
||||
_moreLikeDoc: Document
|
||||
|
||||
get textFilterTargets() {
|
||||
let targets = DEFAULT_TEXT_FILTER_TARGET_OPTIONS
|
||||
if (this.textFilterTarget == TEXT_FILTER_TARGET_FULLTEXT_MORELIKE) {
|
||||
return DEFAULT_TEXT_FILTER_TARGET_OPTIONS.concat([
|
||||
TEXT_FILTER_TARGET_MORELIKE_OPTION,
|
||||
targets = targets.concat([TEXT_FILTER_TARGET_MORELIKE_OPTION])
|
||||
}
|
||||
if (this.textFilterTarget == TEXT_FILTER_TARGET_CUSTOM_FIELDS) {
|
||||
targets = targets.concat([
|
||||
DEPRECATED_CUSTOM_FIELDS_TEXT_FILTER_TARGET_OPTION,
|
||||
])
|
||||
}
|
||||
return DEFAULT_TEXT_FILTER_TARGET_OPTIONS
|
||||
return targets
|
||||
}
|
||||
|
||||
textFilterTarget = TEXT_FILTER_TARGET_TITLE_CONTENT
|
||||
@@ -437,10 +450,12 @@ export class FilterEditorComponent
|
||||
value.forEach((rule) => {
|
||||
switch (rule.rule_type) {
|
||||
case FILTER_TITLE:
|
||||
case FILTER_SIMPLE_TITLE:
|
||||
this._textFilter = rule.value
|
||||
this.textFilterTarget = TEXT_FILTER_TARGET_TITLE
|
||||
break
|
||||
case FILTER_TITLE_CONTENT:
|
||||
case FILTER_SIMPLE_TEXT:
|
||||
this._textFilter = rule.value
|
||||
this.textFilterTarget = TEXT_FILTER_TARGET_TITLE_CONTENT
|
||||
break
|
||||
@@ -762,12 +777,15 @@ export class FilterEditorComponent
|
||||
this.textFilterTarget == TEXT_FILTER_TARGET_TITLE_CONTENT
|
||||
) {
|
||||
filterRules.push({
|
||||
rule_type: FILTER_TITLE_CONTENT,
|
||||
rule_type: FILTER_SIMPLE_TEXT,
|
||||
value: this._textFilter.trim(),
|
||||
})
|
||||
}
|
||||
if (this._textFilter && this.textFilterTarget == TEXT_FILTER_TARGET_TITLE) {
|
||||
filterRules.push({ rule_type: FILTER_TITLE, value: this._textFilter })
|
||||
filterRules.push({
|
||||
rule_type: FILTER_SIMPLE_TITLE,
|
||||
value: this._textFilter,
|
||||
})
|
||||
}
|
||||
if (this.textFilterTarget == TEXT_FILTER_TARGET_ASN) {
|
||||
if (
|
||||
@@ -1009,7 +1027,10 @@ export class FilterEditorComponent
|
||||
) {
|
||||
existingRule = filterRules.find(
|
||||
(fr) =>
|
||||
fr.rule_type == FILTER_TITLE_CONTENT || fr.rule_type == FILTER_TITLE
|
||||
fr.rule_type == FILTER_TITLE_CONTENT ||
|
||||
fr.rule_type == FILTER_SIMPLE_TEXT ||
|
||||
fr.rule_type == FILTER_TITLE ||
|
||||
fr.rule_type == FILTER_SIMPLE_TITLE
|
||||
)
|
||||
existingRule.rule_type = FILTER_FULLTEXT_QUERY
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ import { DataType } from './datatype'
|
||||
export const NEGATIVE_NULL_FILTER_VALUE = -1
|
||||
|
||||
// These correspond to src/documents/models.py and changes here require a DB migration (and vice versa)
|
||||
export const FILTER_TITLE = 0
|
||||
export const FILTER_TITLE = 0 // Deprecated in favor of Tantivy-backed `title_search`. Keep for now for existing saved views
|
||||
export const FILTER_CONTENT = 1
|
||||
|
||||
export const FILTER_ASN = 2
|
||||
@@ -46,7 +46,9 @@ export const FILTER_ADDED_FROM = 46
|
||||
export const FILTER_MODIFIED_BEFORE = 15
|
||||
export const FILTER_MODIFIED_AFTER = 16
|
||||
|
||||
export const FILTER_TITLE_CONTENT = 19
|
||||
export const FILTER_TITLE_CONTENT = 19 // Deprecated in favor of Tantivy-backed `text` filtervar. Keep for now for existing saved views
|
||||
export const FILTER_SIMPLE_TITLE = 48
|
||||
export const FILTER_SIMPLE_TEXT = 49
|
||||
export const FILTER_FULLTEXT_QUERY = 20
|
||||
export const FILTER_FULLTEXT_MORELIKE = 21
|
||||
|
||||
@@ -56,7 +58,7 @@ export const FILTER_OWNER_ISNULL = 34
|
||||
export const FILTER_OWNER_DOES_NOT_INCLUDE = 35
|
||||
export const FILTER_SHARED_BY_USER = 37
|
||||
|
||||
export const FILTER_CUSTOM_FIELDS_TEXT = 36
|
||||
export const FILTER_CUSTOM_FIELDS_TEXT = 36 // Deprecated. UI no longer includes CF text-search mode. Keep for now for existing saved views
|
||||
export const FILTER_HAS_CUSTOM_FIELDS_ALL = 38
|
||||
export const FILTER_HAS_CUSTOM_FIELDS_ANY = 39
|
||||
export const FILTER_DOES_NOT_HAVE_CUSTOM_FIELDS = 40
|
||||
@@ -66,6 +68,9 @@ export const FILTER_CUSTOM_FIELDS_QUERY = 42
|
||||
|
||||
export const FILTER_MIME_TYPE = 47
|
||||
|
||||
export const SIMPLE_TEXT_PARAMETER = 'text'
|
||||
export const SIMPLE_TITLE_PARAMETER = 'title_search'
|
||||
|
||||
export const FILTER_RULE_TYPES: FilterRuleType[] = [
|
||||
{
|
||||
id: FILTER_TITLE,
|
||||
@@ -74,6 +79,13 @@ export const FILTER_RULE_TYPES: FilterRuleType[] = [
|
||||
multi: false,
|
||||
default: '',
|
||||
},
|
||||
{
|
||||
id: FILTER_SIMPLE_TITLE,
|
||||
filtervar: SIMPLE_TITLE_PARAMETER,
|
||||
datatype: 'string',
|
||||
multi: false,
|
||||
default: '',
|
||||
},
|
||||
{
|
||||
id: FILTER_CONTENT,
|
||||
filtervar: 'content__icontains',
|
||||
@@ -279,6 +291,12 @@ export const FILTER_RULE_TYPES: FilterRuleType[] = [
|
||||
datatype: 'string',
|
||||
multi: false,
|
||||
},
|
||||
{
|
||||
id: FILTER_SIMPLE_TEXT,
|
||||
filtervar: SIMPLE_TEXT_PARAMETER,
|
||||
datatype: 'string',
|
||||
multi: false,
|
||||
},
|
||||
{
|
||||
id: FILTER_FULLTEXT_QUERY,
|
||||
filtervar: 'query',
|
||||
|
||||
@@ -10,7 +10,7 @@ import {
|
||||
DOCUMENT_SORT_FIELDS,
|
||||
DOCUMENT_SORT_FIELDS_FULLTEXT,
|
||||
} from 'src/app/data/document'
|
||||
import { FILTER_TITLE } from 'src/app/data/filter-rule-type'
|
||||
import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type'
|
||||
import { SETTINGS_KEYS } from 'src/app/data/ui-settings'
|
||||
import { environment } from 'src/environments/environment'
|
||||
import { PermissionsService } from '../permissions.service'
|
||||
@@ -138,13 +138,13 @@ describe(`DocumentService`, () => {
|
||||
subscription = service
|
||||
.listAllFilteredIds([
|
||||
{
|
||||
rule_type: FILTER_TITLE,
|
||||
rule_type: FILTER_SIMPLE_TITLE,
|
||||
value: 'apple',
|
||||
},
|
||||
])
|
||||
.subscribe()
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}${endpoint}/?page=1&page_size=100000&fields=id&title__icontains=apple`
|
||||
`${environment.apiBaseUrl}${endpoint}/?page=1&page_size=100000&fields=id&title_search=apple`
|
||||
)
|
||||
expect(req.request.method).toEqual('GET')
|
||||
})
|
||||
|
||||
@@ -8,6 +8,10 @@ import {
|
||||
FILTER_HAS_CUSTOM_FIELDS_ALL,
|
||||
FILTER_HAS_CUSTOM_FIELDS_ANY,
|
||||
FILTER_HAS_TAGS_ALL,
|
||||
FILTER_SIMPLE_TEXT,
|
||||
FILTER_SIMPLE_TITLE,
|
||||
FILTER_TITLE,
|
||||
FILTER_TITLE_CONTENT,
|
||||
NEGATIVE_NULL_FILTER_VALUE,
|
||||
} from '../data/filter-rule-type'
|
||||
import {
|
||||
@@ -128,6 +132,26 @@ describe('QueryParams Utils', () => {
|
||||
is_tagged: 0,
|
||||
})
|
||||
|
||||
params = queryParamsFromFilterRules([
|
||||
{
|
||||
rule_type: FILTER_TITLE_CONTENT,
|
||||
value: 'bank statement',
|
||||
},
|
||||
])
|
||||
expect(params).toEqual({
|
||||
text: 'bank statement',
|
||||
})
|
||||
|
||||
params = queryParamsFromFilterRules([
|
||||
{
|
||||
rule_type: FILTER_TITLE,
|
||||
value: 'invoice',
|
||||
},
|
||||
])
|
||||
expect(params).toEqual({
|
||||
title_search: 'invoice',
|
||||
})
|
||||
|
||||
params = queryParamsFromFilterRules([
|
||||
{
|
||||
rule_type: FILTER_HAS_TAGS_ALL,
|
||||
@@ -148,6 +172,30 @@ describe('QueryParams Utils', () => {
|
||||
|
||||
it('should convert filter rules to query params', () => {
|
||||
let rules = filterRulesFromQueryParams(
|
||||
convertToParamMap({
|
||||
text: 'bank statement',
|
||||
})
|
||||
)
|
||||
expect(rules).toEqual([
|
||||
{
|
||||
rule_type: FILTER_SIMPLE_TEXT,
|
||||
value: 'bank statement',
|
||||
},
|
||||
])
|
||||
|
||||
rules = filterRulesFromQueryParams(
|
||||
convertToParamMap({
|
||||
title_search: 'invoice',
|
||||
})
|
||||
)
|
||||
expect(rules).toEqual([
|
||||
{
|
||||
rule_type: FILTER_SIMPLE_TITLE,
|
||||
value: 'invoice',
|
||||
},
|
||||
])
|
||||
|
||||
rules = filterRulesFromQueryParams(
|
||||
convertToParamMap({
|
||||
tags__id__all,
|
||||
})
|
||||
|
||||
@@ -9,8 +9,14 @@ import {
|
||||
FILTER_HAS_CUSTOM_FIELDS_ALL,
|
||||
FILTER_HAS_CUSTOM_FIELDS_ANY,
|
||||
FILTER_RULE_TYPES,
|
||||
FILTER_SIMPLE_TEXT,
|
||||
FILTER_SIMPLE_TITLE,
|
||||
FILTER_TITLE,
|
||||
FILTER_TITLE_CONTENT,
|
||||
FilterRuleType,
|
||||
NEGATIVE_NULL_FILTER_VALUE,
|
||||
SIMPLE_TEXT_PARAMETER,
|
||||
SIMPLE_TITLE_PARAMETER,
|
||||
} from '../data/filter-rule-type'
|
||||
import { ListViewState } from '../services/document-list-view.service'
|
||||
|
||||
@@ -97,6 +103,8 @@ export function transformLegacyFilterRules(
|
||||
export function filterRulesFromQueryParams(
|
||||
queryParams: ParamMap
|
||||
): FilterRule[] {
|
||||
let filterRulesFromQueryParams: FilterRule[] = []
|
||||
|
||||
const allFilterRuleQueryParams: string[] = FILTER_RULE_TYPES.map(
|
||||
(rt) => rt.filtervar
|
||||
)
|
||||
@@ -104,7 +112,6 @@ export function filterRulesFromQueryParams(
|
||||
.filter((rt) => rt !== undefined)
|
||||
|
||||
// transform query params to filter rules
|
||||
let filterRulesFromQueryParams: FilterRule[] = []
|
||||
allFilterRuleQueryParams
|
||||
.filter((frqp) => queryParams.has(frqp))
|
||||
.forEach((filterQueryParamName) => {
|
||||
@@ -146,7 +153,17 @@ export function queryParamsFromFilterRules(filterRules: FilterRule[]): Params {
|
||||
let params = {}
|
||||
for (let rule of filterRules) {
|
||||
let ruleType = FILTER_RULE_TYPES.find((t) => t.id == rule.rule_type)
|
||||
if (ruleType.isnull_filtervar && rule.value == null) {
|
||||
if (
|
||||
rule.rule_type === FILTER_TITLE_CONTENT ||
|
||||
rule.rule_type === FILTER_SIMPLE_TEXT
|
||||
) {
|
||||
params[SIMPLE_TEXT_PARAMETER] = rule.value
|
||||
} else if (
|
||||
rule.rule_type === FILTER_TITLE ||
|
||||
rule.rule_type === FILTER_SIMPLE_TITLE
|
||||
) {
|
||||
params[SIMPLE_TITLE_PARAMETER] = rule.value
|
||||
} else if (ruleType.isnull_filtervar && rule.value == null) {
|
||||
params[ruleType.isnull_filtervar] = 1
|
||||
} else if (
|
||||
ruleType.isnull_filtervar &&
|
||||
|
||||
@@ -50,14 +50,9 @@ from documents.utils import compute_checksum
|
||||
from documents.utils import copy_basic_file_stats
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileGenerationChoices
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
|
||||
from paperless.parsers.utils import extract_pdf_text
|
||||
from paperless.parsers.utils import is_tagged_pdf
|
||||
|
||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||
|
||||
@@ -110,44 +105,6 @@ class ConsumerStatusShortMessage(StrEnum):
|
||||
FAILED = "failed"
|
||||
|
||||
|
||||
def should_produce_archive(
|
||||
parser: "ParserProtocol",
|
||||
mime_type: str,
|
||||
document_path: Path,
|
||||
) -> bool:
|
||||
"""Return True if a PDF/A archive should be produced for this document.
|
||||
|
||||
IMPORTANT: *parser* must be an instantiated parser, not the class.
|
||||
``requires_pdf_rendition`` and ``can_produce_archive`` are instance
|
||||
``@property`` methods — accessing them on the class returns the descriptor
|
||||
(always truthy).
|
||||
"""
|
||||
# Must produce a PDF so the frontend can display the original format at all.
|
||||
if parser.requires_pdf_rendition:
|
||||
return True
|
||||
|
||||
# Parser cannot produce an archive (e.g. TextDocumentParser).
|
||||
if not parser.can_produce_archive:
|
||||
return False
|
||||
|
||||
generation = OcrConfig().archive_file_generation
|
||||
|
||||
if generation == ArchiveFileGenerationChoices.ALWAYS:
|
||||
return True
|
||||
if generation == ArchiveFileGenerationChoices.NEVER:
|
||||
return False
|
||||
|
||||
# auto: produce archives for scanned/image documents; skip for born-digital PDFs.
|
||||
if mime_type.startswith("image/"):
|
||||
return True
|
||||
if mime_type == "application/pdf":
|
||||
if is_tagged_pdf(document_path):
|
||||
return False
|
||||
text = extract_pdf_text(document_path)
|
||||
return text is None or len(text) <= PDF_TEXT_MIN_LENGTH
|
||||
return False
|
||||
|
||||
|
||||
class ConsumerPluginMixin:
|
||||
if TYPE_CHECKING:
|
||||
from logging import Logger
|
||||
@@ -481,16 +438,7 @@ class ConsumerPlugin(
|
||||
)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
|
||||
produce_archive = should_produce_archive(
|
||||
document_parser,
|
||||
mime_type,
|
||||
self.working_copy,
|
||||
)
|
||||
document_parser.parse(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
produce_archive=produce_archive,
|
||||
)
|
||||
document_parser.parse(self.working_copy, mime_type)
|
||||
|
||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||
self._send_progress(
|
||||
@@ -839,7 +787,7 @@ class ConsumerPlugin(
|
||||
|
||||
return document
|
||||
|
||||
def apply_overrides(self, document: Document) -> None:
|
||||
def apply_overrides(self, document) -> None:
|
||||
if self.metadata.correspondent_id:
|
||||
document.correspondent = Correspondent.objects.get(
|
||||
pk=self.metadata.correspondent_id,
|
||||
|
||||
@@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
import functools
|
||||
import inspect
|
||||
import json
|
||||
import logging
|
||||
import operator
|
||||
from contextlib import contextmanager
|
||||
from typing import TYPE_CHECKING
|
||||
@@ -77,6 +78,8 @@ DATETIME_KWARGS = [
|
||||
CUSTOM_FIELD_QUERY_MAX_DEPTH = 10
|
||||
CUSTOM_FIELD_QUERY_MAX_ATOMS = 20
|
||||
|
||||
logger = logging.getLogger("paperless.api")
|
||||
|
||||
|
||||
class CorrespondentFilterSet(FilterSet):
|
||||
class Meta:
|
||||
@@ -162,9 +165,13 @@ class InboxFilter(Filter):
|
||||
|
||||
@extend_schema_field(serializers.CharField)
|
||||
class TitleContentFilter(Filter):
|
||||
# Deprecated but retained for existing saved views. UI uses Tantivy-backed `text` / `title_search` params.
|
||||
def filter(self, qs: Any, value: Any) -> Any:
|
||||
value = value.strip() if isinstance(value, str) else value
|
||||
if value:
|
||||
logger.warning(
|
||||
"Deprecated document filter parameter 'title_content' used; use `text` instead.",
|
||||
)
|
||||
try:
|
||||
return qs.filter(
|
||||
Q(title__icontains=value) | Q(effective_content__icontains=value),
|
||||
@@ -243,6 +250,9 @@ class CustomFieldsFilter(Filter):
|
||||
def filter(self, qs, value):
|
||||
value = value.strip() if isinstance(value, str) else value
|
||||
if value:
|
||||
logger.warning(
|
||||
"Deprecated document filter parameter 'custom_fields__icontains' used; use `custom_field_query` or advanced Tantivy field syntax instead.",
|
||||
)
|
||||
fields_with_matching_selects = CustomField.objects.filter(
|
||||
extra_data__icontains=value,
|
||||
)
|
||||
@@ -747,6 +757,7 @@ class DocumentFilterSet(FilterSet):
|
||||
|
||||
is_in_inbox = InboxFilter()
|
||||
|
||||
# Deprecated, but keep for now for existing saved views
|
||||
title_content = TitleContentFilter()
|
||||
|
||||
content__istartswith = EffectiveContentFilter(lookup_expr="istartswith")
|
||||
@@ -756,6 +767,7 @@ class DocumentFilterSet(FilterSet):
|
||||
|
||||
owner__id__none = ObjectFilter(field_name="owner", exclude=True)
|
||||
|
||||
# Deprecated, UI no longer includes CF text-search mode, but keep for now for existing saved views
|
||||
custom_fields__icontains = CustomFieldsFilter()
|
||||
|
||||
custom_fields__id__all = ObjectFilter(field_name="custom_fields__field")
|
||||
|
||||
@@ -0,0 +1,92 @@
|
||||
# Generated by Django 5.2.12 on 2026-04-01 18:20
|
||||
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
OLD_TITLE_RULE = 0
|
||||
OLD_TITLE_CONTENT_RULE = 19
|
||||
NEW_SIMPLE_TITLE_RULE = 48
|
||||
NEW_SIMPLE_TEXT_RULE = 49
|
||||
|
||||
|
||||
# See documents/models.py SavedViewFilterRule
|
||||
def migrate_saved_view_rules_forward(apps, schema_editor):
|
||||
SavedViewFilterRule = apps.get_model("documents", "SavedViewFilterRule")
|
||||
SavedViewFilterRule.objects.filter(rule_type=OLD_TITLE_RULE).update(
|
||||
rule_type=NEW_SIMPLE_TITLE_RULE,
|
||||
)
|
||||
SavedViewFilterRule.objects.filter(rule_type=OLD_TITLE_CONTENT_RULE).update(
|
||||
rule_type=NEW_SIMPLE_TEXT_RULE,
|
||||
)
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("documents", "0017_migrate_fulltext_query_field_prefixes"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name="savedviewfilterrule",
|
||||
name="rule_type",
|
||||
field=models.PositiveSmallIntegerField(
|
||||
choices=[
|
||||
(0, "title contains"),
|
||||
(1, "content contains"),
|
||||
(2, "ASN is"),
|
||||
(3, "correspondent is"),
|
||||
(4, "document type is"),
|
||||
(5, "is in inbox"),
|
||||
(6, "has tag"),
|
||||
(7, "has any tag"),
|
||||
(8, "created before"),
|
||||
(9, "created after"),
|
||||
(10, "created year is"),
|
||||
(11, "created month is"),
|
||||
(12, "created day is"),
|
||||
(13, "added before"),
|
||||
(14, "added after"),
|
||||
(15, "modified before"),
|
||||
(16, "modified after"),
|
||||
(17, "does not have tag"),
|
||||
(18, "does not have ASN"),
|
||||
(19, "title or content contains"),
|
||||
(20, "fulltext query"),
|
||||
(21, "more like this"),
|
||||
(22, "has tags in"),
|
||||
(23, "ASN greater than"),
|
||||
(24, "ASN less than"),
|
||||
(25, "storage path is"),
|
||||
(26, "has correspondent in"),
|
||||
(27, "does not have correspondent in"),
|
||||
(28, "has document type in"),
|
||||
(29, "does not have document type in"),
|
||||
(30, "has storage path in"),
|
||||
(31, "does not have storage path in"),
|
||||
(32, "owner is"),
|
||||
(33, "has owner in"),
|
||||
(34, "does not have owner"),
|
||||
(35, "does not have owner in"),
|
||||
(36, "has custom field value"),
|
||||
(37, "is shared by me"),
|
||||
(38, "has custom fields"),
|
||||
(39, "has custom field in"),
|
||||
(40, "does not have custom field in"),
|
||||
(41, "does not have custom field"),
|
||||
(42, "custom fields query"),
|
||||
(43, "created to"),
|
||||
(44, "created from"),
|
||||
(45, "added to"),
|
||||
(46, "added from"),
|
||||
(47, "mime type is"),
|
||||
(48, "simple title search"),
|
||||
(49, "simple text search"),
|
||||
],
|
||||
verbose_name="rule type",
|
||||
),
|
||||
),
|
||||
migrations.RunPython(
|
||||
migrate_saved_view_rules_forward,
|
||||
migrations.RunPython.noop,
|
||||
),
|
||||
]
|
||||
@@ -623,6 +623,8 @@ class SavedViewFilterRule(models.Model):
|
||||
(45, _("added to")),
|
||||
(46, _("added from")),
|
||||
(47, _("mime type is")),
|
||||
(48, _("simple title search")),
|
||||
(49, _("simple text search")),
|
||||
]
|
||||
|
||||
saved_view = models.ForeignKey(
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
from documents.search._backend import SORT_FIELD_MAP
|
||||
from documents.search._backend import SearchIndexLockError
|
||||
from documents.search._backend import SearchMode
|
||||
from documents.search._backend import SearchResults
|
||||
from documents.search._backend import TantivyBackend
|
||||
from documents.search._backend import TantivyRelevanceList
|
||||
@@ -9,7 +11,9 @@ from documents.search._schema import needs_rebuild
|
||||
from documents.search._schema import wipe_index
|
||||
|
||||
__all__ = [
|
||||
"SORT_FIELD_MAP",
|
||||
"SearchIndexLockError",
|
||||
"SearchMode",
|
||||
"SearchResults",
|
||||
"TantivyBackend",
|
||||
"TantivyRelevanceList",
|
||||
|
||||
@@ -2,11 +2,11 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC
|
||||
from datetime import datetime
|
||||
from enum import StrEnum
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Self
|
||||
from typing import TypedDict
|
||||
@@ -19,7 +19,10 @@ from django.conf import settings
|
||||
from django.utils.timezone import get_current_timezone
|
||||
from guardian.shortcuts import get_users_with_perms
|
||||
|
||||
from documents.search._normalize import ascii_fold
|
||||
from documents.search._query import build_permission_filter
|
||||
from documents.search._query import parse_simple_text_query
|
||||
from documents.search._query import parse_simple_title_query
|
||||
from documents.search._query import parse_user_query
|
||||
from documents.search._schema import _write_sentinels
|
||||
from documents.search._schema import build_schema
|
||||
@@ -44,15 +47,25 @@ _AUTOCOMPLETE_REGEX_TIMEOUT = 1.0 # seconds; guards against ReDoS on untrusted
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
# Fields tantivy can sort natively — maps Django ORM field names to tantivy schema fields.
|
||||
# Fields not listed here (owner, storage_path__name, id, custom_field_*) must fall back to ORM.
|
||||
SORT_FIELD_MAP: dict[str, str] = {
|
||||
"title": "title_sort",
|
||||
"correspondent__name": "correspondent_sort",
|
||||
"document_type__name": "type_sort",
|
||||
"created": "created",
|
||||
"added": "added",
|
||||
"modified": "modified",
|
||||
"archive_serial_number": "asn",
|
||||
"page_count": "page_count",
|
||||
"num_notes": "num_notes",
|
||||
}
|
||||
|
||||
def _ascii_fold(s: str) -> str:
|
||||
"""
|
||||
Normalize unicode to ASCII equivalent characters for search consistency.
|
||||
|
||||
Converts accented characters (e.g., "café") to their ASCII base forms ("cafe")
|
||||
to enable cross-language searching without requiring exact diacritic matching.
|
||||
"""
|
||||
return unicodedata.normalize("NFD", s).encode("ascii", "ignore").decode()
|
||||
class SearchMode(StrEnum):
|
||||
QUERY = "query"
|
||||
TEXT = "text"
|
||||
TITLE = "title"
|
||||
|
||||
|
||||
def _extract_autocomplete_words(text_sources: list[str]) -> set[str]:
|
||||
@@ -74,7 +87,7 @@ def _extract_autocomplete_words(text_sources: list[str]) -> set[str]:
|
||||
)
|
||||
continue
|
||||
for token in tokens:
|
||||
normalized = _ascii_fold(token.lower())
|
||||
normalized = ascii_fold(token.lower())
|
||||
if normalized:
|
||||
words.add(normalized)
|
||||
return words
|
||||
@@ -294,8 +307,10 @@ class TantivyBackend:
|
||||
doc.add_text("checksum", document.checksum)
|
||||
doc.add_text("title", document.title)
|
||||
doc.add_text("title_sort", document.title)
|
||||
doc.add_text("simple_title", document.title)
|
||||
doc.add_text("content", content)
|
||||
doc.add_text("bigram_content", content)
|
||||
doc.add_text("simple_content", content)
|
||||
|
||||
# Original filename - only add if not None/empty
|
||||
if document.original_filename:
|
||||
@@ -433,6 +448,7 @@ class TantivyBackend:
|
||||
sort_field: str | None,
|
||||
*,
|
||||
sort_reverse: bool,
|
||||
search_mode: SearchMode = SearchMode.QUERY,
|
||||
) -> SearchResults:
|
||||
"""
|
||||
Execute a search query against the document index.
|
||||
@@ -441,20 +457,32 @@ class TantivyBackend:
|
||||
permission filtering before executing against Tantivy. Supports both
|
||||
relevance-based and field-based sorting.
|
||||
|
||||
QUERY search mode supports natural date keywords, field filters, etc.
|
||||
TITLE search mode treats the query as plain text to search for in title only
|
||||
TEXT search mode treats the query as plain text to search for in title and content
|
||||
|
||||
Args:
|
||||
query: User's search query (supports natural date keywords, field filters)
|
||||
query: User's search query
|
||||
user: User for permission filtering (None for superuser/no filtering)
|
||||
page: Page number (1-indexed) for pagination
|
||||
page_size: Number of results per page
|
||||
sort_field: Field to sort by (None for relevance ranking)
|
||||
sort_reverse: Whether to reverse the sort order
|
||||
search_mode: "query" for advanced Tantivy syntax, "text" for
|
||||
plain-text search over title and content only, "title" for
|
||||
plain-text search over title only
|
||||
|
||||
Returns:
|
||||
SearchResults with hits, total count, and processed query
|
||||
"""
|
||||
self._ensure_open()
|
||||
tz = get_current_timezone()
|
||||
user_query = parse_user_query(self._index, query, tz)
|
||||
if search_mode is SearchMode.TEXT:
|
||||
user_query = parse_simple_text_query(self._index, query)
|
||||
elif search_mode is SearchMode.TITLE:
|
||||
user_query = parse_simple_title_query(self._index, query)
|
||||
else:
|
||||
user_query = parse_user_query(self._index, query, tz)
|
||||
|
||||
# Apply permission filter if user is not None (not superuser)
|
||||
if user is not None:
|
||||
@@ -471,22 +499,9 @@ class TantivyBackend:
|
||||
searcher = self._index.searcher()
|
||||
offset = (page - 1) * page_size
|
||||
|
||||
# Map sort fields
|
||||
sort_field_map = {
|
||||
"title": "title_sort",
|
||||
"correspondent__name": "correspondent_sort",
|
||||
"document_type__name": "type_sort",
|
||||
"created": "created",
|
||||
"added": "added",
|
||||
"modified": "modified",
|
||||
"archive_serial_number": "asn",
|
||||
"page_count": "page_count",
|
||||
"num_notes": "num_notes",
|
||||
}
|
||||
|
||||
# Perform search
|
||||
if sort_field and sort_field in sort_field_map:
|
||||
mapped_field = sort_field_map[sort_field]
|
||||
if sort_field and sort_field in SORT_FIELD_MAP:
|
||||
mapped_field = SORT_FIELD_MAP[sort_field]
|
||||
results = searcher.search(
|
||||
final_query,
|
||||
limit=offset + page_size,
|
||||
@@ -594,7 +609,7 @@ class TantivyBackend:
|
||||
List of word suggestions ordered by frequency, then alphabetically
|
||||
"""
|
||||
self._ensure_open()
|
||||
normalized_term = _ascii_fold(term.lower())
|
||||
normalized_term = ascii_fold(term.lower())
|
||||
|
||||
searcher = self._index.searcher()
|
||||
|
||||
|
||||
8
src/documents/search/_normalize.py
Normal file
8
src/documents/search/_normalize.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import unicodedata
|
||||
|
||||
|
||||
def ascii_fold(text: str) -> str:
|
||||
"""Normalize unicode text to ASCII equivalents for search consistency."""
|
||||
return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode()
|
||||
@@ -12,6 +12,8 @@ import tantivy
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from django.conf import settings
|
||||
|
||||
from documents.search._normalize import ascii_fold
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from datetime import tzinfo
|
||||
|
||||
@@ -51,6 +53,7 @@ _WHOOSH_REL_RANGE_RE = regex.compile(
|
||||
)
|
||||
# Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly
|
||||
_DATE8_RE = regex.compile(r"(?P<field>\w+):(?P<date8>\d{8})\b")
|
||||
_SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+")
|
||||
|
||||
|
||||
def _fmt(dt: datetime) -> str:
|
||||
@@ -436,7 +439,27 @@ DEFAULT_SEARCH_FIELDS = [
|
||||
"document_type",
|
||||
"tag",
|
||||
]
|
||||
SIMPLE_SEARCH_FIELDS = ["simple_title", "simple_content"]
|
||||
TITLE_SEARCH_FIELDS = ["simple_title"]
|
||||
_FIELD_BOOSTS = {"title": 2.0}
|
||||
_SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0}
|
||||
|
||||
|
||||
def _build_simple_field_query(
|
||||
index: tantivy.Index,
|
||||
field: str,
|
||||
tokens: list[str],
|
||||
) -> tantivy.Query:
|
||||
patterns = [f".*{regex.escape(token)}.*" for token in tokens]
|
||||
if len(patterns) == 1:
|
||||
query = tantivy.Query.regex_query(index.schema, field, patterns[0])
|
||||
else:
|
||||
query = tantivy.Query.regex_phrase_query(index.schema, field, patterns)
|
||||
|
||||
boost = _SIMPLE_FIELD_BOOSTS.get(field, 1.0)
|
||||
if boost > 1.0:
|
||||
return tantivy.Query.boost_query(query, boost)
|
||||
return query
|
||||
|
||||
|
||||
def parse_user_query(
|
||||
@@ -495,3 +518,52 @@ def parse_user_query(
|
||||
)
|
||||
|
||||
return exact
|
||||
|
||||
|
||||
def parse_simple_query(
|
||||
index: tantivy.Index,
|
||||
raw_query: str,
|
||||
fields: list[str],
|
||||
) -> tantivy.Query:
|
||||
"""
|
||||
Parse a plain-text query using Tantivy over a restricted field set.
|
||||
|
||||
Query string is escaped and normalized to be treated as "simple" text query.
|
||||
"""
|
||||
tokens = [
|
||||
ascii_fold(token.lower())
|
||||
for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
|
||||
]
|
||||
tokens = [token for token in tokens if token]
|
||||
if not tokens:
|
||||
return tantivy.Query.empty_query()
|
||||
|
||||
field_queries = [
|
||||
(tantivy.Occur.Should, _build_simple_field_query(index, field, tokens))
|
||||
for field in fields
|
||||
]
|
||||
if len(field_queries) == 1:
|
||||
return field_queries[0][1]
|
||||
return tantivy.Query.boolean_query(field_queries)
|
||||
|
||||
|
||||
def parse_simple_text_query(
|
||||
index: tantivy.Index,
|
||||
raw_query: str,
|
||||
) -> tantivy.Query:
|
||||
"""
|
||||
Parse a plain-text query over title/content for simple search inputs.
|
||||
"""
|
||||
|
||||
return parse_simple_query(index, raw_query, SIMPLE_SEARCH_FIELDS)
|
||||
|
||||
|
||||
def parse_simple_title_query(
|
||||
index: tantivy.Index,
|
||||
raw_query: str,
|
||||
) -> tantivy.Query:
|
||||
"""
|
||||
Parse a plain-text query over the title field only.
|
||||
"""
|
||||
|
||||
return parse_simple_query(index, raw_query, TITLE_SEARCH_FIELDS)
|
||||
|
||||
@@ -53,6 +53,18 @@ def build_schema() -> tantivy.Schema:
|
||||
# CJK support - not stored, indexed only
|
||||
sb.add_text_field("bigram_content", stored=False, tokenizer_name="bigram_analyzer")
|
||||
|
||||
# Simple substring search support for title/content - not stored, indexed only
|
||||
sb.add_text_field(
|
||||
"simple_title",
|
||||
stored=False,
|
||||
tokenizer_name="simple_search_analyzer",
|
||||
)
|
||||
sb.add_text_field(
|
||||
"simple_content",
|
||||
stored=False,
|
||||
tokenizer_name="simple_search_analyzer",
|
||||
)
|
||||
|
||||
# Autocomplete prefix scan - stored, not indexed
|
||||
sb.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw")
|
||||
|
||||
|
||||
@@ -70,6 +70,7 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
||||
index.register_tokenizer("paperless_text", _paperless_text(language))
|
||||
index.register_tokenizer("simple_analyzer", _simple_analyzer())
|
||||
index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
|
||||
index.register_tokenizer("simple_search_analyzer", _simple_search_analyzer())
|
||||
# Fast-field tokenizer required for fast=True text fields in the schema
|
||||
index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())
|
||||
|
||||
@@ -114,3 +115,15 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer:
|
||||
.filter(tantivy.Filter.lowercase())
|
||||
.build()
|
||||
)
|
||||
|
||||
|
||||
def _simple_search_analyzer() -> tantivy.TextAnalyzer:
|
||||
"""Tokenizer for simple substring search fields: non-whitespace chunks -> lowercase -> ascii_fold."""
|
||||
return (
|
||||
tantivy.TextAnalyzerBuilder(
|
||||
tantivy.Tokenizer.regex(r"\S+"),
|
||||
)
|
||||
.filter(tantivy.Filter.lowercase())
|
||||
.filter(tantivy.Filter.ascii_fold())
|
||||
.build()
|
||||
)
|
||||
|
||||
@@ -30,7 +30,6 @@ from documents.consumer import AsnCheckPlugin
|
||||
from documents.consumer import ConsumerPlugin
|
||||
from documents.consumer import ConsumerPreflightPlugin
|
||||
from documents.consumer import WorkflowTriggerPlugin
|
||||
from documents.consumer import should_produce_archive
|
||||
from documents.data_models import ConsumableDocument
|
||||
from documents.data_models import DocumentMetadataOverrides
|
||||
from documents.double_sided import CollatePlugin
|
||||
@@ -302,16 +301,7 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
parser.configure(ParserContext())
|
||||
|
||||
try:
|
||||
produce_archive = should_produce_archive(
|
||||
parser,
|
||||
mime_type,
|
||||
document.source_path,
|
||||
)
|
||||
parser.parse(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
produce_archive=produce_archive,
|
||||
)
|
||||
parser.parse(document.source_path, mime_type)
|
||||
|
||||
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ from documents.models import CustomField
|
||||
from documents.models import CustomFieldInstance
|
||||
from documents.models import Document
|
||||
from documents.models import Note
|
||||
from documents.search._backend import SearchMode
|
||||
from documents.search._backend import TantivyBackend
|
||||
from documents.search._backend import get_backend
|
||||
from documents.search._backend import reset_backend
|
||||
@@ -46,6 +47,216 @@ class TestWriteBatch:
|
||||
class TestSearch:
|
||||
"""Test search functionality."""
|
||||
|
||||
def test_text_mode_limits_default_search_to_title_and_content(
|
||||
self,
|
||||
backend: TantivyBackend,
|
||||
):
|
||||
"""Simple text mode must not match metadata-only fields."""
|
||||
doc = Document.objects.create(
|
||||
title="Invoice document",
|
||||
content="monthly statement",
|
||||
checksum="TXT1",
|
||||
pk=9,
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
metadata_only = backend.search(
|
||||
"document_type:invoice",
|
||||
user=None,
|
||||
page=1,
|
||||
page_size=10,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=SearchMode.TEXT,
|
||||
)
|
||||
assert metadata_only.total == 0
|
||||
|
||||
content_match = backend.search(
|
||||
"monthly",
|
||||
user=None,
|
||||
page=1,
|
||||
page_size=10,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=SearchMode.TEXT,
|
||||
)
|
||||
assert content_match.total == 1
|
||||
|
||||
def test_title_mode_limits_default_search_to_title_only(
|
||||
self,
|
||||
backend: TantivyBackend,
|
||||
):
|
||||
"""Title mode must not match content-only terms."""
|
||||
doc = Document.objects.create(
|
||||
title="Invoice document",
|
||||
content="monthly statement",
|
||||
checksum="TXT2",
|
||||
pk=10,
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
content_only = backend.search(
|
||||
"monthly",
|
||||
user=None,
|
||||
page=1,
|
||||
page_size=10,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=SearchMode.TITLE,
|
||||
)
|
||||
assert content_only.total == 0
|
||||
|
||||
title_match = backend.search(
|
||||
"invoice",
|
||||
user=None,
|
||||
page=1,
|
||||
page_size=10,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=SearchMode.TITLE,
|
||||
)
|
||||
assert title_match.total == 1
|
||||
|
||||
def test_text_mode_matches_partial_term_substrings(
|
||||
self,
|
||||
backend: TantivyBackend,
|
||||
):
|
||||
"""Simple text mode should support substring matching within tokens."""
|
||||
doc = Document.objects.create(
|
||||
title="Account access",
|
||||
content="password reset instructions",
|
||||
checksum="TXT3",
|
||||
pk=11,
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
prefix_match = backend.search(
|
||||
"pass",
|
||||
user=None,
|
||||
page=1,
|
||||
page_size=10,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=SearchMode.TEXT,
|
||||
)
|
||||
assert prefix_match.total == 1
|
||||
|
||||
infix_match = backend.search(
|
||||
"sswo",
|
||||
user=None,
|
||||
page=1,
|
||||
page_size=10,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=SearchMode.TEXT,
|
||||
)
|
||||
assert infix_match.total == 1
|
||||
|
||||
phrase_match = backend.search(
|
||||
"sswo re",
|
||||
user=None,
|
||||
page=1,
|
||||
page_size=10,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=SearchMode.TEXT,
|
||||
)
|
||||
assert phrase_match.total == 1
|
||||
|
||||
def test_text_mode_does_not_match_on_partial_term_overlap(
|
||||
self,
|
||||
backend: TantivyBackend,
|
||||
):
|
||||
"""Simple text mode should not match documents that merely share partial fragments."""
|
||||
doc = Document.objects.create(
|
||||
title="Adobe Acrobat PDF Files",
|
||||
content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
|
||||
checksum="TXT7",
|
||||
pk=13,
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
non_match = backend.search(
|
||||
"raptor",
|
||||
user=None,
|
||||
page=1,
|
||||
page_size=10,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=SearchMode.TEXT,
|
||||
)
|
||||
assert non_match.total == 0
|
||||
|
||||
def test_text_mode_ignores_queries_without_searchable_tokens(
|
||||
self,
|
||||
backend: TantivyBackend,
|
||||
):
|
||||
"""Simple text mode should safely return no hits for symbol-only strings."""
|
||||
doc = Document.objects.create(
|
||||
title="Guide",
|
||||
content="This is a guide.",
|
||||
checksum="TXT8",
|
||||
pk=14,
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
no_tokens = backend.search(
|
||||
"!!!",
|
||||
user=None,
|
||||
page=1,
|
||||
page_size=10,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=SearchMode.TEXT,
|
||||
)
|
||||
assert no_tokens.total == 0
|
||||
|
||||
def test_title_mode_matches_partial_term_substrings(
|
||||
self,
|
||||
backend: TantivyBackend,
|
||||
):
|
||||
"""Title mode should support substring matching within title tokens."""
|
||||
doc = Document.objects.create(
|
||||
title="Password guide",
|
||||
content="reset instructions",
|
||||
checksum="TXT4",
|
||||
pk=12,
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
prefix_match = backend.search(
|
||||
"pass",
|
||||
user=None,
|
||||
page=1,
|
||||
page_size=10,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=SearchMode.TITLE,
|
||||
)
|
||||
assert prefix_match.total == 1
|
||||
|
||||
infix_match = backend.search(
|
||||
"sswo",
|
||||
user=None,
|
||||
page=1,
|
||||
page_size=10,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=SearchMode.TITLE,
|
||||
)
|
||||
assert infix_match.total == 1
|
||||
|
||||
phrase_match = backend.search(
|
||||
"sswo gu",
|
||||
user=None,
|
||||
page=1,
|
||||
page_size=10,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=SearchMode.TITLE,
|
||||
)
|
||||
assert phrase_match.total == 1
|
||||
|
||||
def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend):
|
||||
"""Search scores must be normalized so top hit has score 1.0 for UI consistency."""
|
||||
for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]):
|
||||
|
||||
@@ -8,6 +8,7 @@ import tantivy
|
||||
|
||||
from documents.search._tokenizer import _bigram_analyzer
|
||||
from documents.search._tokenizer import _paperless_text
|
||||
from documents.search._tokenizer import _simple_search_analyzer
|
||||
from documents.search._tokenizer import register_tokenizers
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -41,6 +42,20 @@ class TestTokenizers:
|
||||
idx.register_tokenizer("bigram_analyzer", _bigram_analyzer())
|
||||
return idx
|
||||
|
||||
@pytest.fixture
|
||||
def simple_search_index(self) -> tantivy.Index:
|
||||
"""Index with simple-search field for Latin substring tests."""
|
||||
sb = tantivy.SchemaBuilder()
|
||||
sb.add_text_field(
|
||||
"simple_content",
|
||||
stored=False,
|
||||
tokenizer_name="simple_search_analyzer",
|
||||
)
|
||||
schema = sb.build()
|
||||
idx = tantivy.Index(schema, path=None)
|
||||
idx.register_tokenizer("simple_search_analyzer", _simple_search_analyzer())
|
||||
return idx
|
||||
|
||||
def test_ascii_fold_finds_accented_content(
|
||||
self,
|
||||
content_index: tantivy.Index,
|
||||
@@ -66,6 +81,24 @@ class TestTokenizers:
|
||||
q = bigram_index.parse_query("東京", ["bigram_content"])
|
||||
assert bigram_index.searcher().search(q, limit=5).count == 1
|
||||
|
||||
def test_simple_search_analyzer_supports_regex_substrings(
|
||||
self,
|
||||
simple_search_index: tantivy.Index,
|
||||
) -> None:
|
||||
"""Whitespace-preserving simple search analyzer supports substring regex matching."""
|
||||
writer = simple_search_index.writer()
|
||||
doc = tantivy.Document()
|
||||
doc.add_text("simple_content", "tag:invoice password-reset")
|
||||
writer.add_document(doc)
|
||||
writer.commit()
|
||||
simple_search_index.reload()
|
||||
q = tantivy.Query.regex_query(
|
||||
simple_search_index.schema,
|
||||
"simple_content",
|
||||
".*sswo.*",
|
||||
)
|
||||
assert simple_search_index.searcher().search(q, limit=5).count == 1
|
||||
|
||||
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
|
||||
"""Unsupported language codes should log a warning and disable stemming gracefully."""
|
||||
sb = tantivy.SchemaBuilder()
|
||||
|
||||
@@ -46,7 +46,7 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
|
||||
"pages": None,
|
||||
"language": None,
|
||||
"mode": None,
|
||||
"archive_file_generation": None,
|
||||
"skip_archive_file": None,
|
||||
"image_dpi": None,
|
||||
"unpaper_clean": None,
|
||||
"deskew": None,
|
||||
|
||||
@@ -91,6 +91,127 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.data["count"], 0)
|
||||
self.assertEqual(len(results), 0)
|
||||
|
||||
def test_simple_text_search(self) -> None:
|
||||
tagged = Tag.objects.create(name="invoice")
|
||||
matching_doc = Document.objects.create(
|
||||
title="Quarterly summary",
|
||||
content="Monthly bank report",
|
||||
checksum="T1",
|
||||
pk=11,
|
||||
)
|
||||
matching_doc.tags.add(tagged)
|
||||
|
||||
metadata_only_doc = Document.objects.create(
|
||||
title="Completely unrelated",
|
||||
content="No matching terms here",
|
||||
checksum="T2",
|
||||
pk=12,
|
||||
)
|
||||
metadata_only_doc.tags.add(tagged)
|
||||
|
||||
backend = get_backend()
|
||||
backend.add_or_update(matching_doc)
|
||||
backend.add_or_update(metadata_only_doc)
|
||||
|
||||
response = self.client.get("/api/documents/?text=monthly")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["count"], 1)
|
||||
self.assertEqual(response.data["results"][0]["id"], matching_doc.id)
|
||||
|
||||
response = self.client.get("/api/documents/?text=tag:invoice")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["count"], 0)
|
||||
|
||||
def test_simple_text_search_matches_substrings(self) -> None:
|
||||
matching_doc = Document.objects.create(
|
||||
title="Quarterly summary",
|
||||
content="Password reset instructions",
|
||||
checksum="T5",
|
||||
pk=15,
|
||||
)
|
||||
|
||||
backend = get_backend()
|
||||
backend.add_or_update(matching_doc)
|
||||
|
||||
response = self.client.get("/api/documents/?text=pass")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["count"], 1)
|
||||
self.assertEqual(response.data["results"][0]["id"], matching_doc.id)
|
||||
|
||||
response = self.client.get("/api/documents/?text=sswo")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["count"], 1)
|
||||
self.assertEqual(response.data["results"][0]["id"], matching_doc.id)
|
||||
|
||||
response = self.client.get("/api/documents/?text=sswo re")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["count"], 1)
|
||||
self.assertEqual(response.data["results"][0]["id"], matching_doc.id)
|
||||
|
||||
def test_simple_text_search_does_not_match_on_partial_term_overlap(self) -> None:
|
||||
non_matching_doc = Document.objects.create(
|
||||
title="Adobe Acrobat PDF Files",
|
||||
content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
|
||||
checksum="T7",
|
||||
pk=17,
|
||||
)
|
||||
|
||||
backend = get_backend()
|
||||
backend.add_or_update(non_matching_doc)
|
||||
|
||||
response = self.client.get("/api/documents/?text=raptor")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["count"], 0)
|
||||
|
||||
def test_simple_title_search(self) -> None:
|
||||
title_match = Document.objects.create(
|
||||
title="Quarterly summary",
|
||||
content="No matching content here",
|
||||
checksum="T3",
|
||||
pk=13,
|
||||
)
|
||||
content_only = Document.objects.create(
|
||||
title="Completely unrelated",
|
||||
content="Quarterly summary appears only in content",
|
||||
checksum="T4",
|
||||
pk=14,
|
||||
)
|
||||
|
||||
backend = get_backend()
|
||||
backend.add_or_update(title_match)
|
||||
backend.add_or_update(content_only)
|
||||
|
||||
response = self.client.get("/api/documents/?title_search=quarterly")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["count"], 1)
|
||||
self.assertEqual(response.data["results"][0]["id"], title_match.id)
|
||||
|
||||
def test_simple_title_search_matches_substrings(self) -> None:
|
||||
title_match = Document.objects.create(
|
||||
title="Password handbook",
|
||||
content="No matching content here",
|
||||
checksum="T6",
|
||||
pk=16,
|
||||
)
|
||||
|
||||
backend = get_backend()
|
||||
backend.add_or_update(title_match)
|
||||
|
||||
response = self.client.get("/api/documents/?title_search=pass")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["count"], 1)
|
||||
self.assertEqual(response.data["results"][0]["id"], title_match.id)
|
||||
|
||||
response = self.client.get("/api/documents/?title_search=sswo")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["count"], 1)
|
||||
self.assertEqual(response.data["results"][0]["id"], title_match.id)
|
||||
|
||||
response = self.client.get("/api/documents/?title_search=sswo hand")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["count"], 1)
|
||||
self.assertEqual(response.data["results"][0]["id"], title_match.id)
|
||||
|
||||
def test_search_returns_all_for_api_version_9(self) -> None:
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
@@ -1493,6 +1614,31 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(results["custom_fields"][0]["id"], custom_field1.id)
|
||||
self.assertEqual(results["workflows"][0]["id"], workflow1.id)
|
||||
|
||||
def test_global_search_db_only_limits_documents_to_title_matches(self) -> None:
|
||||
title_match = Document.objects.create(
|
||||
title="bank statement",
|
||||
content="no additional terms",
|
||||
checksum="GS1",
|
||||
pk=21,
|
||||
)
|
||||
content_only = Document.objects.create(
|
||||
title="not a title match",
|
||||
content="bank appears only in content",
|
||||
checksum="GS2",
|
||||
pk=22,
|
||||
)
|
||||
|
||||
backend = get_backend()
|
||||
backend.add_or_update(title_match)
|
||||
backend.add_or_update(content_only)
|
||||
|
||||
self.client.force_authenticate(self.user)
|
||||
|
||||
response = self.client.get("/api/search/?query=bank&db_only=true")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(len(response.data["documents"]), 1)
|
||||
self.assertEqual(response.data["documents"][0]["id"], title_match.id)
|
||||
|
||||
def test_global_search_filters_owned_mail_objects(self) -> None:
|
||||
user1 = User.objects.create_user("mail-search-user")
|
||||
user2 = User.objects.create_user("other-mail-search-user")
|
||||
|
||||
@@ -1020,7 +1020,7 @@ class TestTagBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, Tes
|
||||
CONSUMER_TAG_BARCODE_SPLIT=True,
|
||||
CONSUMER_TAG_BARCODE_MAPPING={"TAG:(.*)": "\\g<1>"},
|
||||
CELERY_TASK_ALWAYS_EAGER=True,
|
||||
OCR_MODE="auto",
|
||||
OCR_MODE="skip",
|
||||
)
|
||||
def test_consume_barcode_file_tag_split_and_assignment(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -230,11 +230,7 @@ class TestConsumer(
|
||||
shutil.copy(src, dst)
|
||||
return dst
|
||||
|
||||
@override_settings(
|
||||
FILENAME_FORMAT=None,
|
||||
TIME_ZONE="America/Chicago",
|
||||
ARCHIVE_FILE_GENERATION="always",
|
||||
)
|
||||
@override_settings(FILENAME_FORMAT=None, TIME_ZONE="America/Chicago")
|
||||
def testNormalOperation(self) -> None:
|
||||
filename = self.get_test_file()
|
||||
|
||||
@@ -633,10 +629,7 @@ class TestConsumer(
|
||||
# Database empty
|
||||
self.assertEqual(Document.objects.all().count(), 0)
|
||||
|
||||
@override_settings(
|
||||
FILENAME_FORMAT="{correspondent}/{title}",
|
||||
ARCHIVE_FILE_GENERATION="always",
|
||||
)
|
||||
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
|
||||
def testFilenameHandling(self) -> None:
|
||||
with self.get_consumer(
|
||||
self.get_test_file(),
|
||||
@@ -653,7 +646,7 @@ class TestConsumer(
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@mock.patch("documents.consumer.generate_unique_filename")
|
||||
@override_settings(FILENAME_FORMAT="{pk}", ARCHIVE_FILE_GENERATION="always")
|
||||
@override_settings(FILENAME_FORMAT="{pk}")
|
||||
def testFilenameHandlingFallsBackWhenGeneratedPathExceedsDbLimit(self, m):
|
||||
m.side_effect = lambda doc, archive_filename=False: Path(
|
||||
("a" * 1100 + ".pdf") if not archive_filename else ("b" * 1100 + ".pdf"),
|
||||
@@ -680,10 +673,7 @@ class TestConsumer(
|
||||
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@override_settings(
|
||||
FILENAME_FORMAT="{correspondent}/{title}",
|
||||
ARCHIVE_FILE_GENERATION="always",
|
||||
)
|
||||
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
|
||||
@mock.patch("documents.signals.handlers.generate_unique_filename")
|
||||
def testFilenameHandlingUnstableFormat(self, m) -> None:
|
||||
filenames = ["this", "that", "now this", "i cannot decide"]
|
||||
@@ -1031,7 +1021,7 @@ class TestConsumer(
|
||||
self.assertEqual(Document.objects.count(), 2)
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{title}", ARCHIVE_FILE_GENERATION="always")
|
||||
@override_settings(FILENAME_FORMAT="{title}")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def test_similar_filenames(self, m) -> None:
|
||||
shutil.copy(
|
||||
@@ -1142,7 +1132,6 @@ class TestConsumer(
|
||||
mock_mail_parser_parse.assert_called_once_with(
|
||||
consumer.working_copy,
|
||||
"message/rfc822",
|
||||
produce_archive=True,
|
||||
)
|
||||
|
||||
|
||||
@@ -1290,14 +1279,7 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
|
||||
def test_no_pre_consume_script(self, m) -> None:
|
||||
with self.get_consumer(self.test_file) as c:
|
||||
c.run()
|
||||
# Verify no pre-consume script subprocess was invoked
|
||||
# (run_subprocess may still be called by _extract_text_for_archive_check)
|
||||
script_calls = [
|
||||
call
|
||||
for call in m.call_args_list
|
||||
if call.args and call.args[0] and call.args[0][0] not in ("pdftotext",)
|
||||
]
|
||||
self.assertEqual(script_calls, [])
|
||||
m.assert_not_called()
|
||||
|
||||
@mock.patch("documents.consumer.run_subprocess")
|
||||
@override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
|
||||
@@ -1313,16 +1295,9 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
|
||||
with self.get_consumer(self.test_file) as c:
|
||||
c.run()
|
||||
|
||||
self.assertTrue(m.called)
|
||||
m.assert_called_once()
|
||||
|
||||
# Find the call that invoked the pre-consume script
|
||||
# (run_subprocess may also be called by _extract_text_for_archive_check)
|
||||
script_call = next(
|
||||
call
|
||||
for call in m.call_args_list
|
||||
if call.args and call.args[0] and call.args[0][0] == script.name
|
||||
)
|
||||
args, _ = script_call
|
||||
args, _ = m.call_args
|
||||
|
||||
command = args[0]
|
||||
environment = args[1]
|
||||
|
||||
@@ -1,189 +0,0 @@
|
||||
"""Tests for should_produce_archive()."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from documents.consumer import should_produce_archive
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
|
||||
def _parser_instance(
|
||||
*,
|
||||
can_produce: bool = True,
|
||||
requires_rendition: bool = False,
|
||||
) -> MagicMock:
|
||||
"""Return a mock parser instance with the given capability flags."""
|
||||
instance = MagicMock()
|
||||
instance.can_produce_archive = can_produce
|
||||
instance.requires_pdf_rendition = requires_rendition
|
||||
return instance
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def null_app_config(mocker) -> MagicMock:
|
||||
"""Mock ApplicationConfiguration with all fields None → falls back to Django settings."""
|
||||
return mocker.MagicMock(
|
||||
output_type=None,
|
||||
pages=None,
|
||||
language=None,
|
||||
mode=None,
|
||||
archive_file_generation=None,
|
||||
image_dpi=None,
|
||||
unpaper_clean=None,
|
||||
deskew=None,
|
||||
rotate_pages=None,
|
||||
rotate_pages_threshold=None,
|
||||
max_image_pixels=None,
|
||||
color_conversion_strategy=None,
|
||||
user_args=None,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def patch_app_config(mocker, null_app_config):
|
||||
"""Patch BaseConfig._get_config_instance for all tests in this module."""
|
||||
mocker.patch(
|
||||
"paperless.config.BaseConfig._get_config_instance",
|
||||
return_value=null_app_config,
|
||||
)
|
||||
|
||||
|
||||
class TestShouldProduceArchive:
|
||||
@pytest.mark.parametrize(
|
||||
("generation", "can_produce", "requires_rendition", "mime", "expected"),
|
||||
[
|
||||
pytest.param(
|
||||
"never",
|
||||
True,
|
||||
False,
|
||||
"application/pdf",
|
||||
False,
|
||||
id="never-returns-false",
|
||||
),
|
||||
pytest.param(
|
||||
"always",
|
||||
True,
|
||||
False,
|
||||
"application/pdf",
|
||||
True,
|
||||
id="always-returns-true",
|
||||
),
|
||||
pytest.param(
|
||||
"never",
|
||||
True,
|
||||
True,
|
||||
"application/pdf",
|
||||
True,
|
||||
id="requires-rendition-overrides-never",
|
||||
),
|
||||
pytest.param(
|
||||
"always",
|
||||
False,
|
||||
False,
|
||||
"text/plain",
|
||||
False,
|
||||
id="cannot-produce-overrides-always",
|
||||
),
|
||||
pytest.param(
|
||||
"always",
|
||||
False,
|
||||
True,
|
||||
"application/pdf",
|
||||
True,
|
||||
id="requires-rendition-wins-even-if-cannot-produce",
|
||||
),
|
||||
pytest.param(
|
||||
"auto",
|
||||
True,
|
||||
False,
|
||||
"image/tiff",
|
||||
True,
|
||||
id="auto-image-returns-true",
|
||||
),
|
||||
pytest.param(
|
||||
"auto",
|
||||
True,
|
||||
False,
|
||||
"message/rfc822",
|
||||
False,
|
||||
id="auto-non-pdf-non-image-returns-false",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_generation_setting(
|
||||
self,
|
||||
settings,
|
||||
generation: str,
|
||||
can_produce: bool, # noqa: FBT001
|
||||
requires_rendition: bool, # noqa: FBT001
|
||||
mime: str,
|
||||
expected: bool, # noqa: FBT001
|
||||
) -> None:
|
||||
settings.ARCHIVE_FILE_GENERATION = generation
|
||||
parser = _parser_instance(
|
||||
can_produce=can_produce,
|
||||
requires_rendition=requires_rendition,
|
||||
)
|
||||
assert should_produce_archive(parser, mime, Path("/tmp/doc")) is expected
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("extracted_text", "expected"),
|
||||
[
|
||||
pytest.param(
|
||||
"This is a born-digital PDF with lots of text content. " * 10,
|
||||
False,
|
||||
id="born-digital-long-text-skips-archive",
|
||||
),
|
||||
pytest.param(None, True, id="no-text-scanned-produces-archive"),
|
||||
pytest.param("tiny", True, id="short-text-treated-as-scanned"),
|
||||
],
|
||||
)
|
||||
def test_auto_pdf_archive_decision(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
settings,
|
||||
extracted_text: str | None,
|
||||
expected: bool, # noqa: FBT001
|
||||
) -> None:
|
||||
settings.ARCHIVE_FILE_GENERATION = "auto"
|
||||
mocker.patch("documents.consumer.is_tagged_pdf", return_value=False)
|
||||
mocker.patch("documents.consumer.extract_pdf_text", return_value=extracted_text)
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
assert (
|
||||
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
|
||||
is expected
|
||||
)
|
||||
|
||||
def test_tagged_pdf_skips_archive_in_auto_mode(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
settings,
|
||||
) -> None:
|
||||
"""Tagged PDFs (e.g. Word exports) are treated as born-digital regardless of text length."""
|
||||
settings.ARCHIVE_FILE_GENERATION = "auto"
|
||||
mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
assert (
|
||||
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
|
||||
is False
|
||||
)
|
||||
|
||||
def test_tagged_pdf_does_not_call_pdftotext(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
settings,
|
||||
) -> None:
|
||||
"""When a PDF is tagged, pdftotext is not invoked (fast path)."""
|
||||
settings.ARCHIVE_FILE_GENERATION = "auto"
|
||||
mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
|
||||
mock_extract = mocker.patch("documents.consumer.extract_pdf_text")
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
|
||||
mock_extract.assert_not_called()
|
||||
@@ -27,10 +27,7 @@ sample_file: Path = Path(__file__).parent / "samples" / "simple.pdf"
|
||||
|
||||
|
||||
@pytest.mark.management
|
||||
@override_settings(
|
||||
FILENAME_FORMAT="{correspondent}/{title}",
|
||||
ARCHIVE_FILE_GENERATION="always",
|
||||
)
|
||||
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
|
||||
class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
def make_models(self):
|
||||
return Document.objects.create(
|
||||
|
||||
@@ -213,7 +213,6 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
self.assertEqual(Document.global_objects.count(), 0)
|
||||
|
||||
|
||||
@override_settings(ARCHIVE_FILE_GENERATION="always")
|
||||
class TestUpdateContent(DirectoriesMixin, TestCase):
|
||||
def test_update_content_maybe_archive_file(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -1995,11 +1995,23 @@ class ChatStreamingView(GenericAPIView):
|
||||
list=extend_schema(
|
||||
description="Document views including search",
|
||||
parameters=[
|
||||
OpenApiParameter(
|
||||
name="text",
|
||||
type=OpenApiTypes.STR,
|
||||
location=OpenApiParameter.QUERY,
|
||||
description="Simple Tantivy-backed text search query string",
|
||||
),
|
||||
OpenApiParameter(
|
||||
name="title_search",
|
||||
type=OpenApiTypes.STR,
|
||||
location=OpenApiParameter.QUERY,
|
||||
description="Simple Tantivy-backed title-only search query string",
|
||||
),
|
||||
OpenApiParameter(
|
||||
name="query",
|
||||
type=OpenApiTypes.STR,
|
||||
location=OpenApiParameter.QUERY,
|
||||
description="Advanced search query string",
|
||||
description="Advanced Tantivy search query string",
|
||||
),
|
||||
OpenApiParameter(
|
||||
name="full_perms",
|
||||
@@ -2033,7 +2045,9 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
|
||||
def _is_search_request(self):
|
||||
return (
|
||||
"query" in self.request.query_params
|
||||
"text" in self.request.query_params
|
||||
or "title_search" in self.request.query_params
|
||||
or "query" in self.request.query_params
|
||||
or "more_like_id" in self.request.query_params
|
||||
)
|
||||
|
||||
@@ -2041,26 +2055,66 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
if not self._is_search_request():
|
||||
return super().list(request)
|
||||
|
||||
from documents.search import SORT_FIELD_MAP
|
||||
from documents.search import SearchMode
|
||||
from documents.search import TantivyRelevanceList
|
||||
from documents.search import get_backend
|
||||
|
||||
try:
|
||||
backend = get_backend()
|
||||
# ORM-filtered queryset: permissions + field filters + ordering (DRF backends applied)
|
||||
# ORM queryset with field filters applied (tags, correspondent, etc.); used to
|
||||
# intersect with tantivy results. Tantivy handles permission filtering via user param.
|
||||
filtered_qs = self.filter_queryset(self.get_queryset())
|
||||
|
||||
user = None if request.user.is_superuser else request.user
|
||||
|
||||
if "query" in request.query_params:
|
||||
query_str = request.query_params["query"]
|
||||
results = backend.search(
|
||||
query_str,
|
||||
user=user,
|
||||
page=1,
|
||||
page_size=10000,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
)
|
||||
# Parse ordering: extract field name and direction
|
||||
raw_ordering = request.query_params.get("ordering", "")
|
||||
sort_reverse = raw_ordering.startswith("-")
|
||||
sort_field = raw_ordering.lstrip("-") or None
|
||||
# Use tantivy native sorting for indexed fields; fall back to ORM for the rest
|
||||
# (owner, storage_path__name, id, custom_field_* require ORM ordering)
|
||||
tantivy_sort_field = sort_field if sort_field in SORT_FIELD_MAP else None
|
||||
|
||||
if (
|
||||
"text" in request.query_params
|
||||
or "title_search" in request.query_params
|
||||
or "query" in request.query_params
|
||||
):
|
||||
if "text" in request.query_params:
|
||||
search_mode = SearchMode.TEXT
|
||||
query_str = request.query_params["text"]
|
||||
elif "title_search" in request.query_params:
|
||||
search_mode = SearchMode.TITLE
|
||||
query_str = request.query_params["title_search"]
|
||||
else:
|
||||
search_mode = SearchMode.QUERY
|
||||
query_str = request.query_params["query"]
|
||||
|
||||
if tantivy_sort_field:
|
||||
# Tantivy handles sorting and pagination natively
|
||||
page_size = self.paginator.get_page_size(request)
|
||||
page_num = int(request.query_params.get("page", 1))
|
||||
results = backend.search(
|
||||
query_str,
|
||||
user=user,
|
||||
page=page_num,
|
||||
page_size=page_size,
|
||||
sort_field=tantivy_sort_field,
|
||||
sort_reverse=sort_reverse,
|
||||
search_mode=search_mode,
|
||||
)
|
||||
else:
|
||||
# Relevance or ORM-sorted fallback: fetch all hits for downstream ordering
|
||||
results = backend.search(
|
||||
query_str,
|
||||
user=user,
|
||||
page=1,
|
||||
page_size=10000,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=search_mode,
|
||||
)
|
||||
else:
|
||||
# more_like_id — validate permission on the seed document first
|
||||
try:
|
||||
@@ -2084,17 +2138,44 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
page=1,
|
||||
page_size=10000,
|
||||
)
|
||||
tantivy_sort_field = None # MLT always uses relevance order
|
||||
|
||||
hits_by_id = {h["id"]: h for h in results.hits}
|
||||
|
||||
# Determine sort order: no ordering param -> Tantivy relevance; otherwise -> ORM order
|
||||
ordering_param = request.query_params.get("ordering", "").lstrip("-")
|
||||
if not ordering_param:
|
||||
# Preserve Tantivy relevance order; intersect with ORM-visible IDs
|
||||
if tantivy_sort_field:
|
||||
# Tantivy sorted and paginated; intersect with ORM to apply field filters
|
||||
orm_ids = set(filtered_qs.values_list("pk", flat=True))
|
||||
ordered_hits = [h for h in results.hits if h["id"] in orm_ids]
|
||||
serializer = self.get_serializer(ordered_hits, many=True)
|
||||
response = self.get_paginated_response(serializer.data)
|
||||
response.data["corrected_query"] = None
|
||||
response.data["count"] = results.total
|
||||
if get_boolean(
|
||||
str(request.query_params.get("include_selection_data", "false")),
|
||||
):
|
||||
# Fetch all matched IDs for selection data (separate query, ID-only)
|
||||
all_results = backend.search(
|
||||
query_str,
|
||||
user=user,
|
||||
page=1,
|
||||
page_size=10000,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=search_mode,
|
||||
)
|
||||
all_ids = [h["id"] for h in all_results.hits]
|
||||
response.data["selection_data"] = (
|
||||
self._get_selection_data_for_queryset(
|
||||
filtered_qs.filter(pk__in=all_ids),
|
||||
)
|
||||
)
|
||||
return response
|
||||
elif not sort_field:
|
||||
# Relevance order — preserve tantivy ranking, intersect with ORM for field filters
|
||||
orm_ids = set(filtered_qs.values_list("pk", flat=True))
|
||||
ordered_hits = [h for h in results.hits if h["id"] in orm_ids]
|
||||
else:
|
||||
# Use ORM ordering (already applied by DocumentsOrderingFilter)
|
||||
# ORM ordering fallback for fields tantivy can't sort (owner, storage_path__name, id, custom_field_*)
|
||||
hit_ids = set(hits_by_id.keys())
|
||||
orm_ordered_ids = filtered_qs.filter(id__in=hit_ids).values_list(
|
||||
"pk",
|
||||
@@ -3003,6 +3084,9 @@ class GlobalSearchView(PassUserMixin):
|
||||
serializer_class = SearchResultSerializer
|
||||
|
||||
def get(self, request, *args, **kwargs):
|
||||
from documents.search import get_backend
|
||||
from documents.search._backend import SearchMode
|
||||
|
||||
query = request.query_params.get("query", None)
|
||||
if query is None:
|
||||
return HttpResponseBadRequest("Query required")
|
||||
@@ -3019,25 +3103,25 @@ class GlobalSearchView(PassUserMixin):
|
||||
"view_document",
|
||||
Document,
|
||||
)
|
||||
# First search by title
|
||||
docs = all_docs.filter(title__icontains=query)
|
||||
if not db_only and len(docs) < OBJECT_LIMIT:
|
||||
# If we don't have enough results, search by content.
|
||||
# Over-fetch from Tantivy (no permission filter) and rely on
|
||||
# the ORM all_docs queryset for authoritative permission gating.
|
||||
from documents.search import get_backend
|
||||
|
||||
if db_only:
|
||||
docs = all_docs.filter(title__icontains=query)[:OBJECT_LIMIT]
|
||||
else:
|
||||
user = None if request.user.is_superuser else request.user
|
||||
fts_results = get_backend().search(
|
||||
query,
|
||||
user=None,
|
||||
user=user,
|
||||
page=1,
|
||||
page_size=1000,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=SearchMode.TEXT,
|
||||
)
|
||||
fts_ids = {h["id"] for h in fts_results.hits}
|
||||
docs = docs | all_docs.filter(id__in=fts_ids)
|
||||
docs = docs[:OBJECT_LIMIT]
|
||||
docs_by_id = all_docs.in_bulk([hit["id"] for hit in fts_results.hits])
|
||||
docs = [
|
||||
docs_by_id[hit["id"]]
|
||||
for hit in fts_results.hits
|
||||
if hit["id"] in docs_by_id
|
||||
][:OBJECT_LIMIT]
|
||||
saved_views = (
|
||||
get_objects_for_user_owner_aware(
|
||||
request.user,
|
||||
|
||||
@@ -5,7 +5,6 @@ import shutil
|
||||
import stat
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.checks import Error
|
||||
@@ -23,7 +22,7 @@ writeable_hint = (
|
||||
)
|
||||
|
||||
|
||||
def path_check(var: str, directory: Path) -> list[Error]:
|
||||
def path_check(var, directory: Path) -> list[Error]:
|
||||
messages: list[Error] = []
|
||||
if directory:
|
||||
if not directory.is_dir():
|
||||
@@ -60,7 +59,7 @@ def path_check(var: str, directory: Path) -> list[Error]:
|
||||
|
||||
|
||||
@register()
|
||||
def paths_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
def paths_check(app_configs, **kwargs) -> list[Error]:
|
||||
"""
|
||||
Check the various paths for existence, readability and writeability
|
||||
"""
|
||||
@@ -74,7 +73,7 @@ def paths_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
|
||||
|
||||
@register()
|
||||
def binaries_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
def binaries_check(app_configs, **kwargs):
|
||||
"""
|
||||
Paperless requires the existence of a few binaries, so we do some checks
|
||||
for those here.
|
||||
@@ -94,7 +93,7 @@ def binaries_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
|
||||
|
||||
@register()
|
||||
def debug_mode_check(app_configs: Any, **kwargs: Any) -> list[Warning]:
|
||||
def debug_mode_check(app_configs, **kwargs):
|
||||
if settings.DEBUG:
|
||||
return [
|
||||
Warning(
|
||||
@@ -110,7 +109,7 @@ def debug_mode_check(app_configs: Any, **kwargs: Any) -> list[Warning]:
|
||||
|
||||
|
||||
@register()
|
||||
def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warning]:
|
||||
def settings_values_check(app_configs, **kwargs):
|
||||
"""
|
||||
Validates at least some of the user provided settings
|
||||
"""
|
||||
@@ -133,14 +132,23 @@ def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warni
|
||||
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
|
||||
)
|
||||
|
||||
if settings.OCR_MODE not in {"auto", "force", "redo", "off"}:
|
||||
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
|
||||
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
|
||||
|
||||
if settings.ARCHIVE_FILE_GENERATION not in {"auto", "always", "never"}:
|
||||
if settings.OCR_MODE == "skip_noarchive":
|
||||
msgs.append(
|
||||
Warning(
|
||||
'OCR output mode "skip_noarchive" is deprecated and will be '
|
||||
"removed in a future version. Please use "
|
||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
|
||||
),
|
||||
)
|
||||
|
||||
if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
|
||||
msgs.append(
|
||||
Error(
|
||||
"PAPERLESS_ARCHIVE_FILE_GENERATION setting "
|
||||
f'"{settings.ARCHIVE_FILE_GENERATION}" is not valid',
|
||||
"OCR_SKIP_ARCHIVE_FILE setting "
|
||||
f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid',
|
||||
),
|
||||
)
|
||||
|
||||
@@ -183,7 +191,7 @@ def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warni
|
||||
|
||||
|
||||
@register()
|
||||
def audit_log_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
def audit_log_check(app_configs, **kwargs):
|
||||
db_conn = connections["default"]
|
||||
all_tables = db_conn.introspection.table_names()
|
||||
result = []
|
||||
@@ -295,42 +303,7 @@ def check_deprecated_db_settings(
|
||||
|
||||
|
||||
@register()
|
||||
def check_deprecated_v2_ocr_env_vars(
|
||||
app_configs: object,
|
||||
**kwargs: object,
|
||||
) -> list[Warning]:
|
||||
"""Warn when deprecated v2 OCR environment variables are set.
|
||||
|
||||
Users upgrading from v2 may still have these in their environment or
|
||||
config files, where they are now silently ignored.
|
||||
"""
|
||||
warnings: list[Warning] = []
|
||||
|
||||
if os.environ.get("PAPERLESS_OCR_SKIP_ARCHIVE_FILE"):
|
||||
warnings.append(
|
||||
Warning(
|
||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE is set but has no effect. "
|
||||
"Use PAPERLESS_ARCHIVE_FILE_GENERATION=never/always/auto instead.",
|
||||
id="paperless.W002",
|
||||
),
|
||||
)
|
||||
|
||||
ocr_mode = os.environ.get("PAPERLESS_OCR_MODE", "")
|
||||
if ocr_mode in {"skip", "skip_noarchive"}:
|
||||
warnings.append(
|
||||
Warning(
|
||||
f"PAPERLESS_OCR_MODE={ocr_mode!r} is not a valid value. "
|
||||
f"Use PAPERLESS_OCR_MODE=auto (and PAPERLESS_ARCHIVE_FILE_GENERATION=never "
|
||||
f"if you used skip_noarchive) instead.",
|
||||
id="paperless.W003",
|
||||
),
|
||||
)
|
||||
|
||||
return warnings
|
||||
|
||||
|
||||
@register()
|
||||
def check_remote_parser_configured(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
|
||||
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
|
||||
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
|
||||
):
|
||||
@@ -356,7 +329,7 @@ def get_tesseract_langs():
|
||||
|
||||
|
||||
@register()
|
||||
def check_default_language_available(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
def check_default_language_available(app_configs, **kwargs):
|
||||
errs = []
|
||||
|
||||
if not settings.OCR_LANGUAGE:
|
||||
|
||||
@@ -4,11 +4,6 @@ import json
|
||||
from django.conf import settings
|
||||
|
||||
from paperless.models import ApplicationConfiguration
|
||||
from paperless.models import ArchiveFileGenerationChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ColorConvertChoices
|
||||
from paperless.models import ModeChoices
|
||||
from paperless.models import OutputTypeChoices
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
@@ -33,7 +28,7 @@ class OutputTypeConfig(BaseConfig):
|
||||
Almost all parsers care about the chosen PDF output format
|
||||
"""
|
||||
|
||||
output_type: OutputTypeChoices = dataclasses.field(init=False)
|
||||
output_type: str = dataclasses.field(init=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
app_config = self._get_config_instance()
|
||||
@@ -50,17 +45,15 @@ class OcrConfig(OutputTypeConfig):
|
||||
|
||||
pages: int | None = dataclasses.field(init=False)
|
||||
language: str = dataclasses.field(init=False)
|
||||
mode: ModeChoices = dataclasses.field(init=False)
|
||||
archive_file_generation: ArchiveFileGenerationChoices = dataclasses.field(
|
||||
init=False,
|
||||
)
|
||||
mode: str = dataclasses.field(init=False)
|
||||
skip_archive_file: str = dataclasses.field(init=False)
|
||||
image_dpi: int | None = dataclasses.field(init=False)
|
||||
clean: CleanChoices = dataclasses.field(init=False)
|
||||
clean: str = dataclasses.field(init=False)
|
||||
deskew: bool = dataclasses.field(init=False)
|
||||
rotate: bool = dataclasses.field(init=False)
|
||||
rotate_threshold: float = dataclasses.field(init=False)
|
||||
max_image_pixel: float | None = dataclasses.field(init=False)
|
||||
color_conversion_strategy: ColorConvertChoices = dataclasses.field(init=False)
|
||||
color_conversion_strategy: str = dataclasses.field(init=False)
|
||||
user_args: dict[str, str] | None = dataclasses.field(init=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
@@ -71,8 +64,8 @@ class OcrConfig(OutputTypeConfig):
|
||||
self.pages = app_config.pages or settings.OCR_PAGES
|
||||
self.language = app_config.language or settings.OCR_LANGUAGE
|
||||
self.mode = app_config.mode or settings.OCR_MODE
|
||||
self.archive_file_generation = (
|
||||
app_config.archive_file_generation or settings.ARCHIVE_FILE_GENERATION
|
||||
self.skip_archive_file = (
|
||||
app_config.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
|
||||
)
|
||||
self.image_dpi = app_config.image_dpi or settings.OCR_IMAGE_DPI
|
||||
self.clean = app_config.unpaper_clean or settings.OCR_CLEAN
|
||||
|
||||
@@ -1,44 +0,0 @@
|
||||
# Generated by Django 5.2.12 on 2026-03-26 20:31
|
||||
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("paperless", "0007_optimize_integer_field_sizes"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RemoveField(
|
||||
model_name="applicationconfiguration",
|
||||
name="skip_archive_file",
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name="applicationconfiguration",
|
||||
name="archive_file_generation",
|
||||
field=models.CharField(
|
||||
blank=True,
|
||||
choices=[("auto", "auto"), ("always", "always"), ("never", "never")],
|
||||
max_length=8,
|
||||
null=True,
|
||||
verbose_name="Controls archive file generation",
|
||||
),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name="applicationconfiguration",
|
||||
name="mode",
|
||||
field=models.CharField(
|
||||
blank=True,
|
||||
choices=[
|
||||
("auto", "auto"),
|
||||
("force", "force"),
|
||||
("redo", "redo"),
|
||||
("off", "off"),
|
||||
],
|
||||
max_length=16,
|
||||
null=True,
|
||||
verbose_name="Sets the OCR mode",
|
||||
),
|
||||
),
|
||||
]
|
||||
@@ -36,20 +36,20 @@ class ModeChoices(models.TextChoices):
|
||||
and our own custom setting
|
||||
"""
|
||||
|
||||
AUTO = ("auto", _("auto"))
|
||||
FORCE = ("force", _("force"))
|
||||
SKIP = ("skip", _("skip"))
|
||||
REDO = ("redo", _("redo"))
|
||||
OFF = ("off", _("off"))
|
||||
FORCE = ("force", _("force"))
|
||||
SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive"))
|
||||
|
||||
|
||||
class ArchiveFileGenerationChoices(models.TextChoices):
|
||||
class ArchiveFileChoices(models.TextChoices):
|
||||
"""
|
||||
Settings to control creation of an archive PDF file
|
||||
"""
|
||||
|
||||
AUTO = ("auto", _("auto"))
|
||||
ALWAYS = ("always", _("always"))
|
||||
NEVER = ("never", _("never"))
|
||||
WITH_TEXT = ("with_text", _("with_text"))
|
||||
ALWAYS = ("always", _("always"))
|
||||
|
||||
|
||||
class CleanChoices(models.TextChoices):
|
||||
@@ -126,12 +126,12 @@ class ApplicationConfiguration(AbstractSingletonModel):
|
||||
choices=ModeChoices.choices,
|
||||
)
|
||||
|
||||
archive_file_generation = models.CharField(
|
||||
verbose_name=_("Controls archive file generation"),
|
||||
skip_archive_file = models.CharField(
|
||||
verbose_name=_("Controls the generation of an archive file"),
|
||||
null=True,
|
||||
blank=True,
|
||||
max_length=8,
|
||||
choices=ArchiveFileGenerationChoices.choices,
|
||||
max_length=16,
|
||||
choices=ArchiveFileChoices.choices,
|
||||
)
|
||||
|
||||
image_dpi = models.PositiveSmallIntegerField(
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.resources
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
@@ -9,8 +8,6 @@ import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Any
|
||||
from typing import Final
|
||||
from typing import NoReturn
|
||||
from typing import Self
|
||||
|
||||
from django.conf import settings
|
||||
@@ -21,11 +18,9 @@ from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.utils import maybe_override_pixel_limit
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ModeChoices
|
||||
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
|
||||
from paperless.parsers.utils import extract_pdf_text
|
||||
from paperless.parsers.utils import is_tagged_pdf
|
||||
from paperless.parsers.utils import read_file_handle_unicode_errors
|
||||
from paperless.version import __full_version_str__
|
||||
|
||||
@@ -38,11 +33,7 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.tesseract")
|
||||
|
||||
_SRGB_ICC_DATA: Final[bytes] = (
|
||||
importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
|
||||
)
|
||||
|
||||
_SUPPORTED_MIME_TYPES: Final[dict[str, str]] = {
|
||||
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||
"application/pdf": ".pdf",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/png": ".png",
|
||||
@@ -108,7 +99,7 @@ class RasterisedDocumentParser:
|
||||
# Lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def __init__(self, logging_group: object | None = None) -> None:
|
||||
def __init__(self, logging_group: object = None) -> None:
|
||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||
self.tempdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||
@@ -242,7 +233,7 @@ class RasterisedDocumentParser:
|
||||
if (
|
||||
sidecar_file is not None
|
||||
and sidecar_file.is_file()
|
||||
and self.settings.mode != ModeChoices.REDO
|
||||
and self.settings.mode != "redo"
|
||||
):
|
||||
text = read_file_handle_unicode_errors(sidecar_file)
|
||||
|
||||
@@ -259,7 +250,36 @@ class RasterisedDocumentParser:
|
||||
if not Path(pdf_file).is_file():
|
||||
return None
|
||||
|
||||
return post_process_text(extract_pdf_text(Path(pdf_file), log=self.log))
|
||||
try:
|
||||
text = None
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w+",
|
||||
dir=self.tempdir,
|
||||
) as tmp:
|
||||
run_subprocess(
|
||||
[
|
||||
"pdftotext",
|
||||
"-q",
|
||||
"-layout",
|
||||
"-enc",
|
||||
"UTF-8",
|
||||
str(pdf_file),
|
||||
tmp.name,
|
||||
],
|
||||
logger=self.log,
|
||||
)
|
||||
text = read_file_handle_unicode_errors(Path(tmp.name))
|
||||
|
||||
return post_process_text(text)
|
||||
|
||||
except Exception:
|
||||
# If pdftotext fails, fall back to OCR.
|
||||
self.log.warning(
|
||||
"Error while getting text from PDF document with pdftotext",
|
||||
exc_info=True,
|
||||
)
|
||||
# probably not a PDF file.
|
||||
return None
|
||||
|
||||
def construct_ocrmypdf_parameters(
|
||||
self,
|
||||
@@ -269,7 +289,6 @@ class RasterisedDocumentParser:
|
||||
sidecar_file: Path,
|
||||
*,
|
||||
safe_fallback: bool = False,
|
||||
skip_text: bool = False,
|
||||
) -> dict[str, Any]:
|
||||
ocrmypdf_args: dict[str, Any] = {
|
||||
"input_file_or_options": input_file,
|
||||
@@ -288,14 +307,15 @@ class RasterisedDocumentParser:
|
||||
self.settings.color_conversion_strategy
|
||||
)
|
||||
|
||||
if safe_fallback or self.settings.mode == ModeChoices.FORCE:
|
||||
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
|
||||
ocrmypdf_args["force_ocr"] = True
|
||||
elif self.settings.mode in {
|
||||
ModeChoices.SKIP,
|
||||
ModeChoices.SKIP_NO_ARCHIVE,
|
||||
}:
|
||||
ocrmypdf_args["skip_text"] = True
|
||||
elif self.settings.mode == ModeChoices.REDO:
|
||||
ocrmypdf_args["redo_ocr"] = True
|
||||
elif skip_text or self.settings.mode == ModeChoices.OFF:
|
||||
ocrmypdf_args["skip_text"] = True
|
||||
elif self.settings.mode == ModeChoices.AUTO:
|
||||
pass # no extra flag: normal OCR (text not found case)
|
||||
else: # pragma: no cover
|
||||
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
|
||||
|
||||
@@ -380,74 +400,6 @@ class RasterisedDocumentParser:
|
||||
|
||||
return ocrmypdf_args
|
||||
|
||||
def _convert_image_to_pdfa(self, document_path: Path) -> Path:
|
||||
"""Convert an image to a PDF/A-2b file without invoking the OCR engine.
|
||||
|
||||
Uses img2pdf for the initial image->PDF wrapping, then pikepdf to stamp
|
||||
PDF/A-2b conformance metadata.
|
||||
|
||||
No Tesseract and no Ghostscript are invoked.
|
||||
"""
|
||||
import img2pdf
|
||||
import pikepdf
|
||||
|
||||
plain_pdf_path = Path(self.tempdir) / "image_plain.pdf"
|
||||
try:
|
||||
layout_fun = None
|
||||
if self.settings.image_dpi is not None:
|
||||
layout_fun = img2pdf.get_fixed_dpi_layout_fun(
|
||||
(self.settings.image_dpi, self.settings.image_dpi),
|
||||
)
|
||||
plain_pdf_path.write_bytes(
|
||||
img2pdf.convert(str(document_path), layout_fun=layout_fun),
|
||||
)
|
||||
except Exception as e:
|
||||
raise ParseError(
|
||||
f"img2pdf conversion failed for {document_path}: {e!s}",
|
||||
) from e
|
||||
|
||||
pdfa_path = Path(self.tempdir) / "archive.pdf"
|
||||
try:
|
||||
with pikepdf.open(plain_pdf_path) as pdf:
|
||||
cs = pdf.make_stream(_SRGB_ICC_DATA)
|
||||
cs["/N"] = 3
|
||||
output_intent = pikepdf.Dictionary(
|
||||
Type=pikepdf.Name("/OutputIntent"),
|
||||
S=pikepdf.Name("/GTS_PDFA1"),
|
||||
OutputConditionIdentifier=pikepdf.String("sRGB"),
|
||||
DestOutputProfile=cs,
|
||||
)
|
||||
pdf.Root["/OutputIntents"] = pdf.make_indirect(
|
||||
pikepdf.Array([output_intent]),
|
||||
)
|
||||
meta = pdf.open_metadata(set_pikepdf_as_editor=False)
|
||||
meta["pdfaid:part"] = "2"
|
||||
meta["pdfaid:conformance"] = "B"
|
||||
pdf.save(pdfa_path)
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"PDF/A metadata stamping failed ({e!s}); falling back to plain PDF.",
|
||||
)
|
||||
pdfa_path.write_bytes(plain_pdf_path.read_bytes())
|
||||
|
||||
return pdfa_path
|
||||
|
||||
def _handle_subprocess_output_error(self, e: Exception) -> NoReturn:
|
||||
"""Log context for Ghostscript failures and raise ParseError.
|
||||
|
||||
Called from the SubprocessOutputError handlers in parse() to avoid
|
||||
duplicating the Ghostscript hint and re-raise logic.
|
||||
"""
|
||||
if "Ghostscript PDF/A rendering" in str(e):
|
||||
self.log.warning(
|
||||
"Ghostscript PDF/A rendering failed, consider setting "
|
||||
"PAPERLESS_OCR_USER_ARGS: "
|
||||
"'{\"continue_on_soft_render_error\": true}'",
|
||||
)
|
||||
raise ParseError(
|
||||
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
||||
) from e
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
@@ -457,94 +409,57 @@ class RasterisedDocumentParser:
|
||||
) -> None:
|
||||
# This forces tesseract to use one core per page.
|
||||
os.environ["OMP_THREAD_LIMIT"] = "1"
|
||||
VALID_TEXT_LENGTH = 50
|
||||
|
||||
if mime_type == "application/pdf":
|
||||
text_original = self.extract_text(None, document_path)
|
||||
original_has_text = (
|
||||
text_original is not None and len(text_original) > VALID_TEXT_LENGTH
|
||||
)
|
||||
else:
|
||||
text_original = None
|
||||
original_has_text = False
|
||||
|
||||
# If the original has text, and the user doesn't want an archive,
|
||||
# we're done here
|
||||
skip_archive_for_text = (
|
||||
self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
|
||||
or self.settings.skip_archive_file
|
||||
in {
|
||||
ArchiveFileChoices.WITH_TEXT,
|
||||
ArchiveFileChoices.ALWAYS,
|
||||
}
|
||||
)
|
||||
if skip_archive_for_text and original_has_text:
|
||||
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
|
||||
self.text = text_original
|
||||
return
|
||||
|
||||
# Either no text was in the original or there should be an archive
|
||||
# file created, so OCR the file and create an archive with any
|
||||
# text located via OCR
|
||||
|
||||
import ocrmypdf
|
||||
from ocrmypdf import EncryptedPdfError
|
||||
from ocrmypdf import InputFileError
|
||||
from ocrmypdf import SubprocessOutputError
|
||||
from ocrmypdf.exceptions import DigitalSignatureError
|
||||
from ocrmypdf.exceptions import PriorOcrFoundError
|
||||
|
||||
if mime_type == "application/pdf":
|
||||
text_original = self.extract_text(None, document_path)
|
||||
original_has_text = is_tagged_pdf(document_path, log=self.log) or (
|
||||
text_original is not None and len(text_original) > PDF_TEXT_MIN_LENGTH
|
||||
)
|
||||
else:
|
||||
text_original = None
|
||||
original_has_text = False
|
||||
|
||||
# --- OCR_MODE=off: never invoke OCR engine ---
|
||||
if self.settings.mode == ModeChoices.OFF:
|
||||
if not produce_archive:
|
||||
self.text = text_original or ""
|
||||
return
|
||||
if self.is_image(mime_type):
|
||||
try:
|
||||
self.archive_path = self._convert_image_to_pdfa(
|
||||
document_path,
|
||||
)
|
||||
self.text = ""
|
||||
except Exception as e:
|
||||
raise ParseError(
|
||||
f"Image to PDF/A conversion failed: {e!s}",
|
||||
) from e
|
||||
return
|
||||
# PDFs in off mode: PDF/A conversion only via skip_text
|
||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
||||
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
archive_path,
|
||||
sidecar_file,
|
||||
skip_text=True,
|
||||
)
|
||||
try:
|
||||
self.log.debug(
|
||||
f"Calling OCRmyPDF (off mode, PDF/A conversion only): {args}",
|
||||
)
|
||||
ocrmypdf.ocr(**args)
|
||||
self.archive_path = archive_path
|
||||
self.text = self.extract_text(None, archive_path) or text_original or ""
|
||||
except SubprocessOutputError as e:
|
||||
self._handle_subprocess_output_error(e)
|
||||
except Exception as e:
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
return
|
||||
|
||||
# --- OCR_MODE=auto: skip ocrmypdf entirely if text exists and no archive needed ---
|
||||
if (
|
||||
self.settings.mode == ModeChoices.AUTO
|
||||
and original_has_text
|
||||
and not produce_archive
|
||||
):
|
||||
self.log.debug(
|
||||
"Document has text and no archive requested; skipping OCRmyPDF entirely.",
|
||||
)
|
||||
self.text = text_original
|
||||
return
|
||||
|
||||
# --- All other paths: run ocrmypdf ---
|
||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
||||
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
||||
|
||||
# auto mode with existing text: PDF/A conversion only (no OCR).
|
||||
skip_text = self.settings.mode == ModeChoices.AUTO and original_has_text
|
||||
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
archive_path,
|
||||
sidecar_file,
|
||||
skip_text=skip_text,
|
||||
)
|
||||
|
||||
try:
|
||||
self.log.debug(f"Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
if produce_archive:
|
||||
if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
|
||||
self.archive_path = archive_path
|
||||
|
||||
self.text = self.extract_text(sidecar_file, archive_path)
|
||||
@@ -559,8 +474,16 @@ class RasterisedDocumentParser:
|
||||
if original_has_text:
|
||||
self.text = text_original
|
||||
except SubprocessOutputError as e:
|
||||
self._handle_subprocess_output_error(e)
|
||||
except (NoTextFoundException, InputFileError, PriorOcrFoundError) as e:
|
||||
if "Ghostscript PDF/A rendering" in str(e):
|
||||
self.log.warning(
|
||||
"Ghostscript PDF/A rendering failed, consider setting "
|
||||
"PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
|
||||
)
|
||||
|
||||
raise ParseError(
|
||||
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
||||
) from e
|
||||
except (NoTextFoundException, InputFileError) as e:
|
||||
self.log.warning(
|
||||
f"Encountered an error while running OCR: {e!s}. "
|
||||
f"Attempting force OCR to get the text.",
|
||||
@@ -569,6 +492,8 @@ class RasterisedDocumentParser:
|
||||
archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
|
||||
sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
|
||||
|
||||
# Attempt to run OCR with safe settings.
|
||||
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
@@ -580,18 +505,25 @@ class RasterisedDocumentParser:
|
||||
try:
|
||||
self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
# Don't return the archived file here, since this file
|
||||
# is bigger and blurry due to --force-ocr.
|
||||
|
||||
self.text = self.extract_text(
|
||||
sidecar_file_fallback,
|
||||
archive_path_fallback,
|
||||
)
|
||||
if produce_archive:
|
||||
self.archive_path = archive_path_fallback
|
||||
|
||||
except Exception as e:
|
||||
# If this fails, we have a serious issue at hand.
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
|
||||
except Exception as e:
|
||||
# Anything else is probably serious.
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
|
||||
# As a last resort, if we still don't have any text for any reason,
|
||||
# try to extract the text from the original document.
|
||||
if not self.text:
|
||||
if original_has_text:
|
||||
self.text = text_original
|
||||
|
||||
@@ -10,105 +10,15 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Final
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
|
||||
logger = logging.getLogger("paperless.parsers.utils")
|
||||
|
||||
# Minimum character count for a PDF to be considered "born-digital" (has real text).
|
||||
# Used by both the consumer (archive decision) and the tesseract parser (skip-OCR decision).
|
||||
PDF_TEXT_MIN_LENGTH: Final[int] = 50
|
||||
|
||||
|
||||
def is_tagged_pdf(
|
||||
path: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> bool:
|
||||
"""Return True if the PDF declares itself as tagged (born-digital indicator).
|
||||
|
||||
Tagged PDFs (e.g. exported from Word or LibreOffice) have ``/MarkInfo``
|
||||
with ``/Marked true`` in the document root. This is a reliable signal
|
||||
that the document has a logical structure and embedded text — running OCR
|
||||
on it is unnecessary and archive generation can be skipped.
|
||||
|
||||
https://github.com/ocrmypdf/OCRmyPDF/blob/4e974ebd465a5921b2e79004f098f5d203010282/src/ocrmypdf/pdfinfo/info.py#L449
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path:
|
||||
Absolute path to the PDF file.
|
||||
log:
|
||||
Logger for warnings. Falls back to the module-level logger when omitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
``True`` when the PDF is tagged, ``False`` otherwise or on any error.
|
||||
"""
|
||||
import pikepdf
|
||||
|
||||
_log = log or logger
|
||||
try:
|
||||
with pikepdf.open(path) as pdf:
|
||||
mark_info = pdf.Root.get("/MarkInfo")
|
||||
if mark_info is None:
|
||||
return False
|
||||
return bool(mark_info.get("/Marked", False))
|
||||
except Exception:
|
||||
_log.warning("Could not check PDF tag status for %s", path, exc_info=True)
|
||||
return False
|
||||
|
||||
|
||||
def extract_pdf_text(
|
||||
path: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> str | None:
|
||||
"""Run pdftotext on *path* and return the extracted text, or None on failure.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path:
|
||||
Absolute path to the PDF file.
|
||||
log:
|
||||
Logger for warnings. Falls back to the module-level logger when omitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or ``None`` if pdftotext fails or the file is not a PDF.
|
||||
"""
|
||||
from documents.utils import run_subprocess
|
||||
|
||||
_log = log or logger
|
||||
try:
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
out_path = Path(tmpdir) / "text.txt"
|
||||
run_subprocess(
|
||||
[
|
||||
"pdftotext",
|
||||
"-q",
|
||||
"-layout",
|
||||
"-enc",
|
||||
"UTF-8",
|
||||
str(path),
|
||||
str(out_path),
|
||||
],
|
||||
logger=_log,
|
||||
)
|
||||
text = read_file_handle_unicode_errors(out_path, log=_log)
|
||||
return text or None
|
||||
except Exception:
|
||||
_log.warning(
|
||||
"Error while getting text from PDF document with pdftotext",
|
||||
exc_info=True,
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
def read_file_handle_unicode_errors(
|
||||
filepath: Path,
|
||||
|
||||
@@ -880,17 +880,10 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
||||
# OCRmyPDF --output-type options are available.
|
||||
OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
|
||||
|
||||
OCR_MODE = get_choice_from_env(
|
||||
"PAPERLESS_OCR_MODE",
|
||||
{"auto", "force", "redo", "off"},
|
||||
default="auto",
|
||||
)
|
||||
# skip. redo, force
|
||||
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
|
||||
|
||||
ARCHIVE_FILE_GENERATION = get_choice_from_env(
|
||||
"PAPERLESS_ARCHIVE_FILE_GENERATION",
|
||||
{"auto", "always", "never"},
|
||||
default="auto",
|
||||
)
|
||||
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
|
||||
|
||||
OCR_IMAGE_DPI = get_int_from_env("PAPERLESS_OCR_IMAGE_DPI")
|
||||
|
||||
|
||||
@@ -708,7 +708,7 @@ def null_app_config(mocker: MockerFixture) -> MagicMock:
|
||||
pages=None,
|
||||
language=None,
|
||||
mode=None,
|
||||
archive_file_generation=None,
|
||||
skip_archive_file=None,
|
||||
image_dpi=None,
|
||||
unpaper_clean=None,
|
||||
deskew=None,
|
||||
|
||||
@@ -1,436 +0,0 @@
|
||||
"""
|
||||
Focused tests for RasterisedDocumentParser.parse() mode behaviour.
|
||||
|
||||
These tests mock ``ocrmypdf.ocr`` so they run without a real Tesseract/OCRmyPDF
|
||||
installation and execute quickly. The intent is to verify the *control flow*
|
||||
introduced by the ``produce_archive`` flag and the ``OCR_MODE=auto/off`` logic,
|
||||
not to test OCRmyPDF itself.
|
||||
|
||||
Fixtures are pulled from conftest.py in this package.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_LONG_TEXT = "This is a test document with enough text. " * 5 # >50 chars
|
||||
_SHORT_TEXT = "Hi." # <50 chars
|
||||
|
||||
|
||||
def _make_extract_text(text: str | None):
|
||||
"""Return a side_effect function for ``extract_text`` that returns *text*."""
|
||||
|
||||
def _extract(sidecar_file, pdf_file):
|
||||
return text
|
||||
|
||||
return _extract
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# AUTO mode — PDF with sufficient text layer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestAutoModeWithText:
|
||||
"""AUTO mode, original PDF has detectable text (>50 chars)."""
|
||||
|
||||
def test_auto_text_no_archive_skips_ocrmypdf(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=False
|
||||
- PDF with text > VALID_TEXT_LENGTH
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr is NOT called (early return path)
|
||||
- archive_path remains None
|
||||
- text is set from the original
|
||||
"""
|
||||
# Patch extract_text to return long text (simulating detectable text layer)
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text() == _LONG_TEXT
|
||||
|
||||
def test_auto_text_with_archive_calls_ocrmypdf_skip_text(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=True
|
||||
- PDF with text > VALID_TEXT_LENGTH
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr IS called with skip_text=True
|
||||
- archive_path is set
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=True,
|
||||
)
|
||||
|
||||
mock_ocr.assert_called_once()
|
||||
call_kwargs = mock_ocr.call_args.kwargs
|
||||
assert call_kwargs.get("skip_text") is True
|
||||
assert "force_ocr" not in call_kwargs
|
||||
assert "redo_ocr" not in call_kwargs
|
||||
assert tesseract_parser.archive_path is not None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# AUTO mode — PDF without text layer (or too short)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestAutoModeNoText:
|
||||
"""AUTO mode, original PDF has no detectable text (<= 50 chars)."""
|
||||
|
||||
def test_auto_no_text_with_archive_calls_ocrmypdf_no_extra_flag(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
multi_page_images_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=True
|
||||
- PDF with no text (or text <= VALID_TEXT_LENGTH)
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr IS called WITHOUT skip_text/force_ocr/redo_ocr
|
||||
- archive_path is set (since produce_archive=True)
|
||||
"""
|
||||
# Return "no text" for the original; return real text for archive
|
||||
extract_call_count = 0
|
||||
|
||||
def _extract_side(sidecar_file, pdf_file):
|
||||
nonlocal extract_call_count
|
||||
extract_call_count += 1
|
||||
if extract_call_count == 1:
|
||||
return None # original has no text
|
||||
return _LONG_TEXT # text from archive after OCR
|
||||
|
||||
mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
multi_page_images_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=True,
|
||||
)
|
||||
|
||||
mock_ocr.assert_called_once()
|
||||
call_kwargs = mock_ocr.call_args.kwargs
|
||||
assert "skip_text" not in call_kwargs
|
||||
assert "force_ocr" not in call_kwargs
|
||||
assert "redo_ocr" not in call_kwargs
|
||||
assert tesseract_parser.archive_path is not None
|
||||
|
||||
def test_auto_no_text_no_archive_calls_ocrmypdf(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
multi_page_images_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=False
|
||||
- PDF with no text
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr IS called (no early return since no text detected)
|
||||
- archive_path is NOT set (produce_archive=False)
|
||||
"""
|
||||
extract_call_count = 0
|
||||
|
||||
def _extract_side(sidecar_file, pdf_file):
|
||||
nonlocal extract_call_count
|
||||
extract_call_count += 1
|
||||
if extract_call_count == 1:
|
||||
return None
|
||||
return _LONG_TEXT
|
||||
|
||||
mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
multi_page_images_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
mock_ocr.assert_called_once()
|
||||
assert tesseract_parser.archive_path is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OFF mode — PDF
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestOffModePdf:
|
||||
"""OCR_MODE=off, document is a PDF."""
|
||||
|
||||
def test_off_no_archive_returns_pdftotext(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- OFF mode, produce_archive=False
|
||||
- PDF with text
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr is NOT called
|
||||
- archive_path is None
|
||||
- text comes from pdftotext (extract_text)
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text() == _LONG_TEXT
|
||||
|
||||
def test_off_with_archive_calls_ocrmypdf_skip_text(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- OFF mode, produce_archive=True
|
||||
- PDF document
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr IS called with skip_text=True (PDF/A conversion only)
|
||||
- archive_path is set
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=True,
|
||||
)
|
||||
|
||||
mock_ocr.assert_called_once()
|
||||
call_kwargs = mock_ocr.call_args.kwargs
|
||||
assert call_kwargs.get("skip_text") is True
|
||||
assert "force_ocr" not in call_kwargs
|
||||
assert "redo_ocr" not in call_kwargs
|
||||
assert tesseract_parser.archive_path is not None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OFF mode — image
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestOffModeImage:
|
||||
"""OCR_MODE=off, document is an image (PNG)."""
|
||||
|
||||
def test_off_image_no_archive_no_ocrmypdf(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_png_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- OFF mode, produce_archive=False
|
||||
- Image document (PNG)
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr is NOT called
|
||||
- archive_path is None
|
||||
- text is empty string (images have no text layer)
|
||||
"""
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(simple_png_file, "image/png", produce_archive=False)
|
||||
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text() == ""
|
||||
|
||||
def test_off_image_with_archive_uses_img2pdf_path(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_png_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- OFF mode, produce_archive=True
|
||||
- Image document (PNG)
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- _convert_image_to_pdfa() is called instead of ocrmypdf.ocr
|
||||
- archive_path is set to the returned path
|
||||
- text is empty string
|
||||
"""
|
||||
fake_archive = Path("/tmp/fake-archive.pdf")
|
||||
mock_convert = mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"_convert_image_to_pdfa",
|
||||
return_value=fake_archive,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(simple_png_file, "image/png", produce_archive=True)
|
||||
|
||||
mock_convert.assert_called_once_with(simple_png_file)
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path == fake_archive
|
||||
assert tesseract_parser.get_text() == ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# produce_archive=False never sets archive_path for FORCE / REDO / AUTO modes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestProduceArchiveFalse:
|
||||
"""Verify produce_archive=False never results in an archive regardless of mode."""
|
||||
|
||||
@pytest.mark.parametrize("mode", ["force", "redo"])
|
||||
def test_produce_archive_false_force_redo_modes(
|
||||
self,
|
||||
mode: str,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
multi_page_images_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- FORCE or REDO mode, produce_archive=False
|
||||
- Any PDF
|
||||
WHEN:
|
||||
- parse() is called (ocrmypdf mocked to succeed)
|
||||
THEN:
|
||||
- archive_path is NOT set even though ocrmypdf ran
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = mode
|
||||
tesseract_parser.parse(
|
||||
multi_page_images_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text() is not None
|
||||
|
||||
def test_produce_archive_false_auto_with_text(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=False
|
||||
- PDF with text > VALID_TEXT_LENGTH
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf is skipped entirely (early return)
|
||||
- archive_path is None
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
@@ -94,35 +94,15 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
WHEN:
|
||||
- OCR parameters are constructed
|
||||
THEN:
|
||||
- Configuration from database is utilized (AUTO mode with skip_text=True
|
||||
triggers skip_text; AUTO mode alone does not add any extra flag)
|
||||
- Configuration from database is utilized
|
||||
"""
|
||||
# AUTO mode with skip_text=True explicitly passed: skip_text is set
|
||||
with override_settings(OCR_MODE="redo"):
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.mode = ModeChoices.AUTO
|
||||
instance.save()
|
||||
|
||||
params = RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
|
||||
input_file="input.pdf",
|
||||
output_file="output.pdf",
|
||||
sidecar_file="sidecar.txt",
|
||||
mime_type="application/pdf",
|
||||
safe_fallback=False,
|
||||
skip_text=True,
|
||||
)
|
||||
self.assertTrue(params["skip_text"])
|
||||
self.assertNotIn("redo_ocr", params)
|
||||
self.assertNotIn("force_ocr", params)
|
||||
|
||||
# AUTO mode alone (no skip_text): no extra OCR flag is set
|
||||
with override_settings(OCR_MODE="redo"):
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.mode = ModeChoices.AUTO
|
||||
instance.mode = ModeChoices.SKIP
|
||||
instance.save()
|
||||
|
||||
params = self.get_params()
|
||||
self.assertNotIn("skip_text", params)
|
||||
self.assertTrue(params["skip_text"])
|
||||
self.assertNotIn("redo_ocr", params)
|
||||
self.assertNotIn("force_ocr", params)
|
||||
|
||||
|
||||
@@ -370,26 +370,15 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Multi-page digital PDF with sufficient text layer
|
||||
- Default settings (mode=auto, produce_archive=True)
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Archive is created (AUTO mode + text present + produce_archive=True
|
||||
→ PDF/A conversion via skip_text)
|
||||
- Text is extracted
|
||||
"""
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-digital.pdf",
|
||||
tesseract_samples_dir / "simple-digital.pdf",
|
||||
"application/pdf",
|
||||
)
|
||||
assert tesseract_parser.archive_path is not None
|
||||
assert tesseract_parser.archive_path.is_file()
|
||||
assert_ordered_substrings(
|
||||
tesseract_parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
tesseract_parser.get_text(),
|
||||
["This is a test document."],
|
||||
)
|
||||
|
||||
def test_with_form_default(
|
||||
@@ -408,7 +397,7 @@ class TestParsePdf:
|
||||
["Please enter your name in here:", "This is a PDF document with a form."],
|
||||
)
|
||||
|
||||
def test_with_form_redo_no_archive_when_not_requested(
|
||||
def test_with_form_redo_produces_no_archive(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
@@ -417,7 +406,6 @@ class TestParsePdf:
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "with-form.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -445,7 +433,7 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.parse(tesseract_samples_dir / "signed.pdf", "application/pdf")
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -461,7 +449,7 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "encrypted.pdf",
|
||||
"application/pdf",
|
||||
@@ -571,7 +559,7 @@ class TestParseMultiPage:
|
||||
@pytest.mark.parametrize(
|
||||
"mode",
|
||||
[
|
||||
pytest.param("auto", id="auto"),
|
||||
pytest.param("skip", id="skip"),
|
||||
pytest.param("redo", id="redo"),
|
||||
pytest.param("force", id="force"),
|
||||
],
|
||||
@@ -599,7 +587,7 @@ class TestParseMultiPage:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-images.pdf",
|
||||
"application/pdf",
|
||||
@@ -747,18 +735,16 @@ class TestSkipArchive:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with existing text layer
|
||||
- Mode: auto, produce_archive=False
|
||||
- Mode: skip_noarchive
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text extracted from original; no archive created (text exists +
|
||||
produce_archive=False skips OCRmyPDF entirely)
|
||||
- Text extracted; no archive created
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-digital.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -774,13 +760,13 @@ class TestSkipArchive:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with image-only pages (no text layer)
|
||||
- Mode: auto, skip_archive_file: auto
|
||||
- Mode: skip_noarchive
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text extracted; archive created (OCR needed, no existing text)
|
||||
- Text extracted; archive created (OCR needed)
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-images.pdf",
|
||||
"application/pdf",
|
||||
@@ -792,58 +778,41 @@ class TestSkipArchive:
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("produce_archive", "filename", "expect_archive"),
|
||||
("skip_archive_file", "filename", "expect_archive"),
|
||||
[
|
||||
pytest.param("never", "multi-page-digital.pdf", True, id="never-with-text"),
|
||||
pytest.param("never", "multi-page-images.pdf", True, id="never-no-text"),
|
||||
pytest.param(
|
||||
True,
|
||||
"multi-page-digital.pdf",
|
||||
True,
|
||||
id="produce-archive-with-text",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
"multi-page-images.pdf",
|
||||
True,
|
||||
id="produce-archive-no-text",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
"with_text",
|
||||
"multi-page-digital.pdf",
|
||||
False,
|
||||
id="no-archive-with-text-layer",
|
||||
id="with-text-layer",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
"with_text",
|
||||
"multi-page-images.pdf",
|
||||
False,
|
||||
id="no-archive-no-text-layer",
|
||||
True,
|
||||
id="with-text-no-layer",
|
||||
),
|
||||
pytest.param(
|
||||
"always",
|
||||
"multi-page-digital.pdf",
|
||||
False,
|
||||
id="always-with-text",
|
||||
),
|
||||
pytest.param("always", "multi-page-images.pdf", False, id="always-no-text"),
|
||||
],
|
||||
)
|
||||
def test_produce_archive_flag(
|
||||
def test_skip_archive_file_setting(
|
||||
self,
|
||||
produce_archive: bool, # noqa: FBT001
|
||||
skip_archive_file: str,
|
||||
filename: str,
|
||||
expect_archive: bool, # noqa: FBT001
|
||||
expect_archive: str,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Various PDFs (with and without text layers)
|
||||
- produce_archive flag set to True or False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- archive_path is set if and only if produce_archive=True
|
||||
- Text is always extracted
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / filename,
|
||||
"application/pdf",
|
||||
produce_archive=produce_archive,
|
||||
)
|
||||
tesseract_parser.settings.skip_archive_file = skip_archive_file
|
||||
tesseract_parser.parse(tesseract_samples_dir / filename, "application/pdf")
|
||||
text = tesseract_parser.get_text().lower()
|
||||
assert_ordered_substrings(text, ["page 1", "page 2", "page 3"])
|
||||
if expect_archive:
|
||||
@@ -851,59 +820,6 @@ class TestSkipArchive:
|
||||
else:
|
||||
assert tesseract_parser.archive_path is None
|
||||
|
||||
def test_tagged_pdf_skips_ocr_in_auto_mode(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true)
|
||||
- Mode: auto, produce_archive=False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- OCRmyPDF is not invoked (tagged ⇒ original_has_text=True)
|
||||
- Text is extracted from the original via pdftotext
|
||||
- No archive is produced
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "simple-digital.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text()
|
||||
|
||||
def test_tagged_pdf_produces_pdfa_archive_without_ocr(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true)
|
||||
- Mode: auto, produce_archive=True
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- OCRmyPDF runs with skip_text (PDF/A conversion only, no OCR)
|
||||
- Archive is produced
|
||||
- Text is preserved from the original
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "simple-digital.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=True,
|
||||
)
|
||||
assert tesseract_parser.archive_path is not None
|
||||
assert tesseract_parser.get_text()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parse — mixed pages / sidecar
|
||||
@@ -919,13 +835,13 @@ class TestParseMixed:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text in some pages (image) and some pages (digital)
|
||||
- Mode: auto (skip_text), skip_archive_file: always
|
||||
- Mode: skip
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- All pages extracted; archive created; sidecar notes skipped pages
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-mixed.pdf",
|
||||
"application/pdf",
|
||||
@@ -982,18 +898,17 @@ class TestParseMixed:
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with mixed pages (some with text, some image-only)
|
||||
- Mode: auto, produce_archive=False
|
||||
- File with mixed pages
|
||||
- Mode: skip_noarchive
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- No archive created (produce_archive=False); text from text layer present
|
||||
- No archive created (file has text layer); later-page text present
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-mixed.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -1008,12 +923,12 @@ class TestParseMixed:
|
||||
|
||||
|
||||
class TestParseRotate:
|
||||
def test_rotate_auto_mode(
|
||||
def test_rotate_skip_mode(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.rotate = True
|
||||
tesseract_parser.parse(tesseract_samples_dir / "rotated.pdf", "application/pdf")
|
||||
assert_ordered_substrings(
|
||||
@@ -1040,19 +955,12 @@ class TestParseRtl:
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- PDF with RTL Arabic text in its text layer (short: 18 chars)
|
||||
- mode=off, produce_archive=True: PDF/A conversion via skip_text, no OCR engine
|
||||
- PDF with RTL Arabic text
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Arabic content is extracted from the PDF text layer (normalised for bidi)
|
||||
|
||||
Note: The RTL PDF has a short text layer (< VALID_TEXT_LENGTH=50) so AUTO mode
|
||||
would attempt full OCR, which fails due to PriorOcrFoundError and falls back to
|
||||
force-ocr with English Tesseract (producing garbage). Using mode="off" forces
|
||||
skip_text=True so the Arabic text layer is preserved through PDF/A conversion.
|
||||
- Arabic content is extracted (normalised for bidi)
|
||||
"""
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "rtl-test.pdf",
|
||||
"application/pdf",
|
||||
@@ -1115,11 +1023,11 @@ class TestOcrmypdfParameters:
|
||||
assert ("clean" in params) == expected_clean
|
||||
assert ("clean_final" in params) == expected_clean_final
|
||||
|
||||
def test_clean_final_auto_mode(
|
||||
def test_clean_final_skip_mode(
|
||||
self,
|
||||
make_tesseract_parser: MakeTesseractParser,
|
||||
) -> None:
|
||||
with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="auto") as parser:
|
||||
with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="skip") as parser:
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
assert params["clean_final"] is True
|
||||
assert "clean" not in params
|
||||
@@ -1136,9 +1044,9 @@ class TestOcrmypdfParameters:
|
||||
@pytest.mark.parametrize(
|
||||
("ocr_mode", "ocr_deskew", "expect_deskew"),
|
||||
[
|
||||
pytest.param("auto", True, True, id="auto-deskew-on"),
|
||||
pytest.param("skip", True, True, id="skip-deskew-on"),
|
||||
pytest.param("redo", True, False, id="redo-deskew-off"),
|
||||
pytest.param("auto", False, False, id="auto-no-deskew"),
|
||||
pytest.param("skip", False, False, id="skip-no-deskew"),
|
||||
],
|
||||
)
|
||||
def test_deskew_option(
|
||||
|
||||
@@ -132,13 +132,13 @@ class TestOcrSettingsChecks:
|
||||
pytest.param(
|
||||
"OCR_MODE",
|
||||
"skip_noarchive",
|
||||
'OCR output mode "skip_noarchive"',
|
||||
id="deprecated-mode-now-invalid",
|
||||
"deprecated",
|
||||
id="deprecated-mode",
|
||||
),
|
||||
pytest.param(
|
||||
"ARCHIVE_FILE_GENERATION",
|
||||
"OCR_SKIP_ARCHIVE_FILE",
|
||||
"invalid",
|
||||
'PAPERLESS_ARCHIVE_FILE_GENERATION setting "invalid"',
|
||||
'OCR_SKIP_ARCHIVE_FILE setting "invalid"',
|
||||
id="invalid-skip-archive-file",
|
||||
),
|
||||
pytest.param(
|
||||
|
||||
@@ -1,64 +0,0 @@
|
||||
"""Tests for v3 system checks: deprecated v2 OCR env var warnings."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless.checks import check_deprecated_v2_ocr_env_vars
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
|
||||
class TestDeprecatedV2OcrEnvVarWarnings:
|
||||
def test_no_deprecated_vars_returns_empty(self, mocker: MockerFixture) -> None:
|
||||
"""No warnings when neither deprecated variable is set."""
|
||||
mocker.patch.dict(os.environ, {"PAPERLESS_OCR_MODE": "auto"}, clear=True)
|
||||
result = check_deprecated_v2_ocr_env_vars(None)
|
||||
assert result == []
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("env_var", "env_value", "expected_id", "expected_fragment"),
|
||||
[
|
||||
pytest.param(
|
||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE",
|
||||
"always",
|
||||
"paperless.W002",
|
||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE",
|
||||
id="skip-archive-file-warns",
|
||||
),
|
||||
pytest.param(
|
||||
"PAPERLESS_OCR_MODE",
|
||||
"skip",
|
||||
"paperless.W003",
|
||||
"skip",
|
||||
id="ocr-mode-skip-warns",
|
||||
),
|
||||
pytest.param(
|
||||
"PAPERLESS_OCR_MODE",
|
||||
"skip_noarchive",
|
||||
"paperless.W003",
|
||||
"skip_noarchive",
|
||||
id="ocr-mode-skip-noarchive-warns",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_deprecated_var_produces_one_warning(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
env_var: str,
|
||||
env_value: str,
|
||||
expected_id: str,
|
||||
expected_fragment: str,
|
||||
) -> None:
|
||||
"""Each deprecated setting in isolation produces exactly one warning."""
|
||||
mocker.patch.dict(os.environ, {env_var: env_value}, clear=True)
|
||||
result = check_deprecated_v2_ocr_env_vars(None)
|
||||
|
||||
assert len(result) == 1
|
||||
warning = result[0]
|
||||
assert warning.id == expected_id
|
||||
assert expected_fragment in warning.msg
|
||||
@@ -1,66 +0,0 @@
|
||||
"""Tests for OcrConfig archive_file_generation field behavior."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
from django.test import override_settings
|
||||
|
||||
from paperless.config import OcrConfig
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def null_app_config(mocker) -> MagicMock:
|
||||
"""Mock ApplicationConfiguration with all fields None → falls back to Django settings."""
|
||||
return mocker.MagicMock(
|
||||
output_type=None,
|
||||
pages=None,
|
||||
language=None,
|
||||
mode=None,
|
||||
archive_file_generation=None,
|
||||
image_dpi=None,
|
||||
unpaper_clean=None,
|
||||
deskew=None,
|
||||
rotate_pages=None,
|
||||
rotate_pages_threshold=None,
|
||||
max_image_pixels=None,
|
||||
color_conversion_strategy=None,
|
||||
user_args=None,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def make_ocr_config(mocker, null_app_config):
|
||||
mocker.patch(
|
||||
"paperless.config.BaseConfig._get_config_instance",
|
||||
return_value=null_app_config,
|
||||
)
|
||||
|
||||
def _make(**django_settings_overrides):
|
||||
with override_settings(**django_settings_overrides):
|
||||
return OcrConfig()
|
||||
|
||||
return _make
|
||||
|
||||
|
||||
class TestOcrConfigArchiveFileGeneration:
|
||||
def test_auto_from_settings(self, make_ocr_config) -> None:
|
||||
cfg = make_ocr_config(OCR_MODE="auto", ARCHIVE_FILE_GENERATION="auto")
|
||||
assert cfg.archive_file_generation == "auto"
|
||||
|
||||
def test_always_from_settings(self, make_ocr_config) -> None:
|
||||
cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="always")
|
||||
assert cfg.archive_file_generation == "always"
|
||||
|
||||
def test_never_from_settings(self, make_ocr_config) -> None:
|
||||
cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="never")
|
||||
assert cfg.archive_file_generation == "never"
|
||||
|
||||
def test_db_value_overrides_setting(self, make_ocr_config, null_app_config) -> None:
|
||||
null_app_config.archive_file_generation = "never"
|
||||
cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="always")
|
||||
assert cfg.archive_file_generation == "never"
|
||||
@@ -1,25 +0,0 @@
|
||||
"""Tests for paperless.parsers.utils helpers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from paperless.parsers.utils import is_tagged_pdf
|
||||
|
||||
SAMPLES = Path(__file__).parent / "samples" / "tesseract"
|
||||
|
||||
|
||||
class TestIsTaggedPdf:
|
||||
def test_tagged_pdf_returns_true(self) -> None:
|
||||
assert is_tagged_pdf(SAMPLES / "simple-digital.pdf") is True
|
||||
|
||||
def test_untagged_pdf_returns_false(self) -> None:
|
||||
assert is_tagged_pdf(SAMPLES / "multi-page-images.pdf") is False
|
||||
|
||||
def test_nonexistent_path_returns_false(self) -> None:
|
||||
assert is_tagged_pdf(Path("/nonexistent/file.pdf")) is False
|
||||
|
||||
def test_corrupt_pdf_returns_false(self, tmp_path: Path) -> None:
|
||||
bad = tmp_path / "bad.pdf"
|
||||
bad.write_bytes(b"not a pdf")
|
||||
assert is_tagged_pdf(bad) is False
|
||||
Reference in New Issue
Block a user