Compare commits

..

28 Commits

Author SHA1 Message Date
Trenton H
8a2ae2059c Plays around with tantivy handling the ordering, for fields we index at least 2026-04-02 15:11:11 -07:00
shamoon
576adad9ae Last frontend coverage, good to do actually 2026-04-02 13:46:31 -07:00
shamoon
50de8b5721 Fix sonar thing, 1 more coverage line 2026-04-02 13:42:53 -07:00
shamoon
44ef454065 Centralize/de-dupe this const 2026-04-02 13:30:14 -07:00
shamoon
b3851c8bc5 Fix frontend coverage 2026-04-02 13:21:03 -07:00
shamoon
fc950a27ca Add deprecated filter logging 2026-04-02 13:19:50 -07:00
shamoon
068188f549 Drop backwards migration 2026-04-02 13:14:25 -07:00
shamoon
9248733ed1 De-dupe this 2026-04-02 13:11:15 -07:00
shamoon
c16d23fe21 Document TITLE search mode in TantivyBackend 2026-04-02 13:11:14 -07:00
shamoon
134b758c84 Rename migration 2026-04-02 13:11:14 -07:00
shamoon
f40442c61e Update filter-editor.component.ts 2026-04-02 13:11:13 -07:00
shamoon
5adfa49d00 Update API docs 2026-04-02 13:11:13 -07:00
shamoon
c06687d070 Switch simple substring search to simple_search analyzer 2026-04-02 13:11:12 -07:00
shamoon
8c737f41c0 Fix e2e 2026-04-02 13:11:12 -07:00
shamoon
20d43936b3 Just moving these comments 2026-04-02 13:11:12 -07:00
shamoon
87728f3448 Fix this one failing test 2026-04-02 13:11:11 -07:00
shamoon
804b5ed99d Update all these uses of FILTER_TITLE_CONTENT 2026-04-02 13:11:11 -07:00
shamoon
f13f6a132c Update the filter editor too 2026-04-02 13:11:10 -07:00
shamoon
66ad082f8e Bring in the new filter type to frontend 2026-04-02 13:11:10 -07:00
shamoon
357f462e82 Ok make it a proper filter type 2026-04-02 13:11:09 -07:00
shamoon
b40721ae41 Add a couple deprecation notes 2026-04-02 13:11:09 -07:00
shamoon
75afbd1a3c Drop the custom fields text query option, but dont break existing views 2026-04-02 13:11:09 -07:00
shamoon
9df15416dc Backend tests 2026-04-02 13:11:08 -07:00
shamoon
0ff1a7809e Use tantivy for global search too 2026-04-02 13:11:08 -07:00
shamoon
32dbb2438b Handle simple searches with frontend query param parsing 2026-04-02 13:11:07 -07:00
shamoon
442d049a57 Wire the simple searches to view 2026-04-02 13:11:07 -07:00
shamoon
e8c39e83fc Add a simple title query 2026-04-02 13:11:06 -07:00
shamoon
e6a334878c Add simple text search mode and API param 2026-04-02 13:11:06 -07:00
57 changed files with 1251 additions and 1670 deletions

1
.gitignore vendored
View File

@@ -111,4 +111,3 @@ celerybeat-schedule*
# ignore pnpm package store folder created when setting up the devcontainer
.pnpm-store/
.worktrees

View File

@@ -62,10 +62,14 @@ The REST api provides five different forms of authentication.
## Searching for documents
Full text searching is available on the `/api/documents/` endpoint. Two
specific query parameters cause the API to return full text search
Full text searching is available on the `/api/documents/` endpoint. The
following query parameters cause the API to return Tantivy-backed search
results:
- `/api/documents/?text=your%20search%20query`: Search title and content
using simple substring-style search.
- `/api/documents/?title_search=your%20search%20query`: Search title only
using simple substring-style search.
- `/api/documents/?query=your%20search%20query`: Search for a document
using a full text query. For details on the syntax, see [Basic Usage - Searching](usage.md#basic-usage_searching).
- `/api/documents/?more_like_id=1234`: Search for documents similar to
@@ -439,3 +443,5 @@ Initial API version.
- The `all` parameter of list endpoints is now deprecated and will be removed in a future version.
- The bulk edit objects endpoint now supports `all` and `filters` parameters to avoid having to send
large lists of object IDs for operations affecting many objects.
- The legacy `title_content` document search parameter is deprecated and will be removed in a future version.
Clients should use `text` for simple title-and-content search and `title_search` for title-only search.

View File

@@ -801,14 +801,11 @@ parsing documents.
#### [`PAPERLESS_OCR_MODE=<mode>`](#PAPERLESS_OCR_MODE) {#PAPERLESS_OCR_MODE}
: Tell paperless when and how to perform ocr on your documents. Four
: Tell paperless when and how to perform ocr on your documents. Four
modes are available:
- `auto` (default): Paperless detects whether a document already
has embedded text via pdftotext. If sufficient text is found,
OCR is skipped for that document (`--skip-text`). If no text is
present, OCR runs normally. This is the safest option for mixed
document collections.
- `skip`: Paperless skips all pages and will perform ocr only on
pages where no text is present. This is the safest option.
- `redo`: Paperless will OCR all pages of your documents and
attempt to replace any existing text layers with new text. This
@@ -826,59 +823,24 @@ modes are available:
significantly larger and text won't appear as sharp when zoomed
in.
- `off`: Paperless never invokes the OCR engine. For PDFs, text
is extracted via pdftotext only. For image documents, text will
be empty. Archive file generation still works via format
conversion (no Tesseract or Ghostscript required).
The default is `skip`, which only performs OCR when necessary and
always creates archived documents.
The default is `auto`.
For the `skip`, `redo`, and `force` modes, read more about OCR
behaviour in the [OCRmyPDF
Read more about this in the [OCRmyPDF
documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
#### [`PAPERLESS_ARCHIVE_FILE_GENERATION=<mode>`](#PAPERLESS_ARCHIVE_FILE_GENERATION) {#PAPERLESS_ARCHIVE_FILE_GENERATION}
#### [`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=<mode>`](#PAPERLESS_OCR_SKIP_ARCHIVE_FILE) {#PAPERLESS_OCR_SKIP_ARCHIVE_FILE}
: Controls when paperless creates a PDF/A archive version of your
documents. Archive files are stored alongside the original and are used
for display in the web interface.
: Specify when you would like paperless to skip creating an archived
version of your documents. This is useful if you don't want to have two
almost-identical versions of your documents in the media folder.
- `auto` (default): Produce archives for scanned or image-based
documents. Skip archive generation for born-digital PDFs that
already contain embedded text. This is the recommended setting
for mixed document collections.
- `always`: Always produce a PDF/A archive when the parser
supports it, regardless of whether the document already has
text.
- `never`: Never produce an archive. Only the original file is
stored. Saves disk space but the web viewer will display the
original file directly.
- `never`: Never skip creating an archived version.
- `with_text`: Skip creating an archived version for documents
that already have embedded text.
- `always`: Always skip creating an archived version.
**Behaviour by file type and mode** (`auto` column shows the default):
| Document type | `never` | `auto` (default) | `always` |
| -------------------------- | ------- | -------------------------- | -------- |
| Scanned image (TIFF, JPEG) | No | **Yes** | Yes |
| Image-based PDF | No | **Yes** (short/no text, untagged) | Yes |
| Born-digital PDF | No | No (tagged or has embedded text) | Yes |
| Plain text, email, HTML | No | No | No |
| DOCX / ODT (via Tika) | Yes\* | Yes\* | Yes\* |
\* Tika always produces a PDF rendition for display; this counts as
the archive regardless of the setting.
!!! note
This setting applies to the built-in Tesseract parser. Parsers
that must always convert documents to PDF for display (e.g. DOCX,
ODT via Tika) will produce a PDF regardless of this setting.
!!! note
The **remote OCR parser** (Azure AI) always produces a searchable
PDF and stores it as the archive copy, regardless of this setting.
`ARCHIVE_FILE_GENERATION=never` has no effect when the remote
parser handles a document.
The default is `never`.
#### [`PAPERLESS_OCR_CLEAN=<mode>`](#PAPERLESS_OCR_CLEAN) {#PAPERLESS_OCR_CLEAN}

View File

@@ -104,64 +104,7 @@ Multiple options are combined in a single value:
PAPERLESS_DB_OPTIONS="sslmode=require;sslrootcert=/certs/ca.pem;pool.max_size=10"
```
## OCR and Archive File Generation Settings
The settings that control OCR behaviour and archive file generation have been redesigned. The old settings that coupled these two concerns together are **removed** — old values are not silently honoured; a startup warning is logged if any removed variable is still set in your environment.
### Removed settings
| Removed Setting | Replacement |
| ------------------------------------------- | --------------------------------------------------------------------- |
| `PAPERLESS_OCR_MODE=skip` | `PAPERLESS_OCR_MODE=auto` (new default) |
| `PAPERLESS_OCR_MODE=skip_noarchive` | `PAPERLESS_OCR_MODE=auto` + `PAPERLESS_ARCHIVE_FILE_GENERATION=never` |
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never` | `PAPERLESS_ARCHIVE_FILE_GENERATION=always` |
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text` | `PAPERLESS_ARCHIVE_FILE_GENERATION=auto` (new default) |
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always` | `PAPERLESS_ARCHIVE_FILE_GENERATION=never` |
### What changed and why
Previously, `OCR_MODE` conflated two independent concerns: whether to run OCR and whether to produce an archive. `skip` meant "skip OCR if text exists, but always produce an archive". `skip_noarchive` meant "skip OCR if text exists, and also skip the archive". This made it impossible to, for example, disable OCR entirely while still producing archives.
The new settings are independent:
- [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) controls OCR: `auto` (default), `force`, `redo`, `off`.
- [`PAPERLESS_ARCHIVE_FILE_GENERATION`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) controls archive production: `auto` (default), `always`, `never`.
### Action Required
Remove any `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` variable from your environment. If you relied on `OCR_MODE=skip` or `OCR_MODE=skip_noarchive`, update accordingly:
```bash
# v2: skip OCR when text present, always archive
PAPERLESS_OCR_MODE=skip
# v3: equivalent (auto is the new default)
# No change needed — auto is the default
# v2: skip OCR when text present, skip archive too
PAPERLESS_OCR_MODE=skip_noarchive
# v3: equivalent
PAPERLESS_OCR_MODE=auto
PAPERLESS_ARCHIVE_FILE_GENERATION=never
# v2: always skip archive
PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always
# v3: equivalent
PAPERLESS_ARCHIVE_FILE_GENERATION=never
# v2: skip archive only for born-digital docs
PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text
# v3: equivalent (auto is the new default)
PAPERLESS_ARCHIVE_FILE_GENERATION=auto
```
### Remote OCR parser
If you use the **remote OCR parser** (Azure AI), note that it always produces a
searchable PDF and stores it as the archive copy. `ARCHIVE_FILE_GENERATION=never`
has no effect for documents handled by the remote parser — the archive is produced
unconditionally by the remote engine.
# Search Index (Whoosh -> Tantivy)
## Search Index (Whoosh -> Tantivy)
The full-text search backend has been replaced with [Tantivy](https://github.com/quickwit-oss/tantivy).
The index format is incompatible with Whoosh, so **the search index is automatically rebuilt from

View File

@@ -633,11 +633,12 @@ hardware, but a few settings can improve performance:
consumption, so you might want to lower these settings (example: 2
workers and 1 thread to always have some computing power left for
other tasks).
- Keep [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) at its default value `auto` and consider
- Keep [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) at its default value `skip` and consider
OCRing your documents before feeding them into Paperless. Some
scanners are able to do this!
- Set [`PAPERLESS_ARCHIVE_FILE_GENERATION`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) to `never` to skip archive
file generation entirely, saving disk space at the cost of in-browser PDF/A viewing.
- Set [`PAPERLESS_OCR_SKIP_ARCHIVE_FILE`](configuration.md#PAPERLESS_OCR_SKIP_ARCHIVE_FILE) to `with_text` to skip archive
file generation for already OCRed documents, or `always` to skip it
for all documents.
- If you want to perform OCR on the device, consider using
`PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use
less memory at the expense of slightly worse OCR results.

View File

@@ -134,9 +134,9 @@ following operations on your documents:
!!! tip
This process can be configured to fit your needs. If you don't want
paperless to create archived versions for born-digital documents, set
[`PAPERLESS_ARCHIVE_FILE_GENERATION=auto`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION)
(the default). To skip archives entirely, use `never`. Please read the
paperless to create archived versions for digital documents, you can
do so by setting
`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text`. Please read the
[relevant section in the documentation](configuration.md#ocr).
!!! note

View File

@@ -49,11 +49,11 @@ test('text filtering', async ({ page }) => {
await page.getByRole('main').getByRole('combobox').click()
await page.getByRole('main').getByRole('combobox').fill('test')
await expect(page.locator('pngx-document-list')).toHaveText(/32 documents/)
await expect(page).toHaveURL(/title_content=test/)
await expect(page).toHaveURL(/text=test/)
await page.getByRole('button', { name: 'Title & content' }).click()
await page.getByRole('button', { name: 'Title', exact: true }).click()
await expect(page.locator('pngx-document-list')).toHaveText(/9 documents/)
await expect(page).toHaveURL(/title__icontains=test/)
await expect(page).toHaveURL(/title_search=test/)
await page.getByRole('button', { name: 'Title', exact: true }).click()
await page.getByRole('button', { name: 'Advanced search' }).click()
await expect(page).toHaveURL(/query=test/)

View File

@@ -3545,7 +3545,7 @@
"time": 1.091,
"request": {
"method": "GET",
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&title_content=test",
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&text=test",
"httpVersion": "HTTP/1.1",
"cookies": [],
"headers": [
@@ -3579,7 +3579,7 @@
"value": "true"
},
{
"name": "title_content",
"name": "text",
"value": "test"
}
],
@@ -4303,7 +4303,7 @@
"time": 0.603,
"request": {
"method": "GET",
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&title__icontains=test",
"url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&title_search=test",
"httpVersion": "HTTP/1.1",
"cookies": [],
"headers": [
@@ -4337,7 +4337,7 @@
"value": "true"
},
{
"name": "title__icontains",
"name": "title_search",
"value": "test"
}
],

View File

@@ -24,7 +24,7 @@ import {
FILTER_HAS_DOCUMENT_TYPE_ANY,
FILTER_HAS_STORAGE_PATH_ANY,
FILTER_HAS_TAGS_ALL,
FILTER_TITLE_CONTENT,
FILTER_SIMPLE_TEXT,
} from 'src/app/data/filter-rule-type'
import { GlobalSearchType, SETTINGS_KEYS } from 'src/app/data/ui-settings'
import { DocumentListViewService } from 'src/app/services/document-list-view.service'
@@ -545,7 +545,7 @@ describe('GlobalSearchComponent', () => {
component.query = 'test'
component.runFullSearch()
expect(qfSpy).toHaveBeenCalledWith([
{ rule_type: FILTER_TITLE_CONTENT, value: 'test' },
{ rule_type: FILTER_SIMPLE_TEXT, value: 'test' },
])
settingsService.set(

View File

@@ -25,7 +25,7 @@ import {
FILTER_HAS_DOCUMENT_TYPE_ANY,
FILTER_HAS_STORAGE_PATH_ANY,
FILTER_HAS_TAGS_ALL,
FILTER_TITLE_CONTENT,
FILTER_SIMPLE_TEXT,
} from 'src/app/data/filter-rule-type'
import { ObjectWithId } from 'src/app/data/object-with-id'
import { GlobalSearchType, SETTINGS_KEYS } from 'src/app/data/ui-settings'
@@ -410,7 +410,7 @@ export class GlobalSearchComponent implements OnInit {
public runFullSearch() {
const ruleType = this.useAdvancedForFullSearch
? FILTER_FULLTEXT_QUERY
: FILTER_TITLE_CONTENT
: FILTER_SIMPLE_TEXT
this.documentService.searchQuery = this.useAdvancedForFullSearch
? this.query
: ''

View File

@@ -4,7 +4,7 @@ import { ComponentFixture, TestBed } from '@angular/core/testing'
import { By } from '@angular/platform-browser'
import { NgbAccordionButton, NgbActiveModal } from '@ng-bootstrap/ng-bootstrap'
import { of, throwError } from 'rxjs'
import { FILTER_TITLE } from 'src/app/data/filter-rule-type'
import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type'
import { DocumentService } from 'src/app/services/rest/document.service'
import { StoragePathService } from 'src/app/services/rest/storage-path.service'
import { SettingsService } from 'src/app/services/settings.service'
@@ -105,7 +105,7 @@ describe('StoragePathEditDialogComponent', () => {
null,
'created',
true,
[{ rule_type: FILTER_TITLE, value: 'bar' }],
[{ rule_type: FILTER_SIMPLE_TITLE, value: 'bar' }],
{ truncate_content: true }
)
listSpy.mockReturnValueOnce(

View File

@@ -23,7 +23,7 @@ import {
} from 'rxjs'
import { EditDialogComponent } from 'src/app/components/common/edit-dialog/edit-dialog.component'
import { Document } from 'src/app/data/document'
import { FILTER_TITLE } from 'src/app/data/filter-rule-type'
import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type'
import { DEFAULT_MATCHING_ALGORITHM } from 'src/app/data/matching-model'
import { StoragePath } from 'src/app/data/storage-path'
import { IfOwnerDirective } from 'src/app/directives/if-owner.directive'
@@ -146,7 +146,7 @@ export class StoragePathEditDialogComponent
null,
'created',
true,
[{ rule_type: FILTER_TITLE, value: title }],
[{ rule_type: FILTER_SIMPLE_TITLE, value: title }],
{ truncate_content: true }
)
.pipe(

View File

@@ -3,7 +3,7 @@ import { provideHttpClientTesting } from '@angular/common/http/testing'
import { ComponentFixture, TestBed } from '@angular/core/testing'
import { NG_VALUE_ACCESSOR } from '@angular/forms'
import { of, throwError } from 'rxjs'
import { FILTER_TITLE } from 'src/app/data/filter-rule-type'
import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type'
import { DocumentService } from 'src/app/services/rest/document.service'
import { DocumentLinkComponent } from './document-link.component'
@@ -99,7 +99,7 @@ describe('DocumentLinkComponent', () => {
null,
'created',
true,
[{ rule_type: FILTER_TITLE, value: 'bar' }],
[{ rule_type: FILTER_SIMPLE_TITLE, value: 'bar' }],
{ truncate_content: true }
)
listSpy.mockReturnValueOnce(throwError(() => new Error()))

View File

@@ -28,7 +28,7 @@ import {
tap,
} from 'rxjs'
import { Document } from 'src/app/data/document'
import { FILTER_TITLE } from 'src/app/data/filter-rule-type'
import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type'
import { CustomDatePipe } from 'src/app/pipes/custom-date.pipe'
import { DocumentService } from 'src/app/services/rest/document.service'
import { AbstractInputComponent } from '../abstract-input'
@@ -121,7 +121,7 @@ export class DocumentLinkComponent
null,
'created',
true,
[{ rule_type: FILTER_TITLE, value: title }],
[{ rule_type: FILTER_SIMPLE_TITLE, value: title }],
{ truncate_content: true }
)
.pipe(

View File

@@ -428,7 +428,7 @@ describe('BulkEditorComponent', () => {
req.flush(true)
expect(req.request.body).toEqual({
all: true,
filters: { title__icontains: 'apple' },
filters: { title_search: 'apple' },
method: 'modify_tags',
parameters: { add_tags: [101], remove_tags: [] },
})

View File

@@ -67,6 +67,8 @@ import {
FILTER_OWNER_DOES_NOT_INCLUDE,
FILTER_OWNER_ISNULL,
FILTER_SHARED_BY_USER,
FILTER_SIMPLE_TEXT,
FILTER_SIMPLE_TITLE,
FILTER_STORAGE_PATH,
FILTER_TITLE,
FILTER_TITLE_CONTENT,
@@ -312,7 +314,7 @@ describe('FilterEditorComponent', () => {
expect(component.textFilter).toEqual(null)
component.filterRules = [
{
rule_type: FILTER_TITLE_CONTENT,
rule_type: FILTER_SIMPLE_TEXT,
value: 'foo',
},
]
@@ -320,6 +322,18 @@ describe('FilterEditorComponent', () => {
expect(component.textFilterTarget).toEqual('title-content') // TEXT_FILTER_TARGET_TITLE_CONTENT
}))
it('should ingest legacy text filter rules for doc title + content', fakeAsync(() => {
expect(component.textFilter).toEqual(null)
component.filterRules = [
{
rule_type: FILTER_TITLE_CONTENT,
value: 'legacy foo',
},
]
expect(component.textFilter).toEqual('legacy foo')
expect(component.textFilterTarget).toEqual('title-content') // TEXT_FILTER_TARGET_TITLE_CONTENT
}))
it('should ingest text filter rules for doc asn', fakeAsync(() => {
expect(component.textFilter).toEqual(null)
component.filterRules = [
@@ -1117,7 +1131,7 @@ describe('FilterEditorComponent', () => {
expect(component.textFilter).toEqual('foo')
expect(component.filterRules).toEqual([
{
rule_type: FILTER_TITLE_CONTENT,
rule_type: FILTER_SIMPLE_TEXT,
value: 'foo',
},
])
@@ -1136,7 +1150,7 @@ describe('FilterEditorComponent', () => {
expect(component.textFilterTarget).toEqual('title')
expect(component.filterRules).toEqual([
{
rule_type: FILTER_TITLE,
rule_type: FILTER_SIMPLE_TITLE,
value: 'foo',
},
])
@@ -1250,30 +1264,12 @@ describe('FilterEditorComponent', () => {
])
}))
it('should convert user input to correct filter rules on custom fields query', fakeAsync(() => {
component.textFilterInput.nativeElement.value = 'foo'
component.textFilterInput.nativeElement.dispatchEvent(new Event('input'))
const textFieldTargetDropdown = fixture.debugElement.queryAll(
By.directive(NgbDropdownItem)
)[3]
textFieldTargetDropdown.triggerEventHandler('click') // TEXT_FILTER_TARGET_CUSTOM_FIELDS
fixture.detectChanges()
tick(400)
expect(component.textFilterTarget).toEqual('custom-fields')
expect(component.filterRules).toEqual([
{
rule_type: FILTER_CUSTOM_FIELDS_TEXT,
value: 'foo',
},
])
}))
it('should convert user input to correct filter rules on mime type', fakeAsync(() => {
component.textFilterInput.nativeElement.value = 'pdf'
component.textFilterInput.nativeElement.dispatchEvent(new Event('input'))
const textFieldTargetDropdown = fixture.debugElement.queryAll(
By.directive(NgbDropdownItem)
)[4]
)[3]
textFieldTargetDropdown.triggerEventHandler('click') // TEXT_FILTER_TARGET_MIME_TYPE
fixture.detectChanges()
tick(400)
@@ -1291,8 +1287,8 @@ describe('FilterEditorComponent', () => {
component.textFilterInput.nativeElement.dispatchEvent(new Event('input'))
const textFieldTargetDropdown = fixture.debugElement.queryAll(
By.directive(NgbDropdownItem)
)[5]
textFieldTargetDropdown.triggerEventHandler('click') // TEXT_FILTER_TARGET_ASN
)[4]
textFieldTargetDropdown.triggerEventHandler('click') // TEXT_FILTER_TARGET_FULLTEXT_QUERY
fixture.detectChanges()
tick(400)
expect(component.textFilterTarget).toEqual('fulltext-query')
@@ -1696,12 +1692,56 @@ describe('FilterEditorComponent', () => {
])
}))
it('should convert legacy title filters into full text query when adding a created relative date', fakeAsync(() => {
component.filterRules = [
{
rule_type: FILTER_TITLE,
value: 'foo',
},
]
const dateCreatedDropdown = fixture.debugElement.queryAll(
By.directive(DatesDropdownComponent)
)[0]
component.dateCreatedRelativeDate = RelativeDate.WITHIN_1_WEEK
dateCreatedDropdown.triggerEventHandler('datesSet')
fixture.detectChanges()
tick(400)
expect(component.filterRules).toEqual([
{
rule_type: FILTER_FULLTEXT_QUERY,
value: 'foo,created:[-1 week to now]',
},
])
}))
it('should convert simple title filters into full text query when adding a created relative date', fakeAsync(() => {
component.filterRules = [
{
rule_type: FILTER_SIMPLE_TITLE,
value: 'foo',
},
]
const dateCreatedDropdown = fixture.debugElement.queryAll(
By.directive(DatesDropdownComponent)
)[0]
component.dateCreatedRelativeDate = RelativeDate.WITHIN_1_WEEK
dateCreatedDropdown.triggerEventHandler('datesSet')
fixture.detectChanges()
tick(400)
expect(component.filterRules).toEqual([
{
rule_type: FILTER_FULLTEXT_QUERY,
value: 'foo,created:[-1 week to now]',
},
])
}))
it('should leave relative dates not in quick list intact', fakeAsync(() => {
component.textFilterInput.nativeElement.value = 'created:[-2 week to now]'
component.textFilterInput.nativeElement.dispatchEvent(new Event('input'))
const textFieldTargetDropdown = fixture.debugElement.queryAll(
By.directive(NgbDropdownItem)
)[5]
)[4]
textFieldTargetDropdown.triggerEventHandler('click')
fixture.detectChanges()
tick(400)
@@ -2031,12 +2071,30 @@ describe('FilterEditorComponent', () => {
component.filterRules = [
{
rule_type: FILTER_TITLE,
rule_type: FILTER_SIMPLE_TITLE,
value: 'foo',
},
]
expect(component.generateFilterName()).toEqual('Title: foo')
component.filterRules = [
{
rule_type: FILTER_TITLE_CONTENT,
value: 'legacy foo',
},
]
expect(component.generateFilterName()).toEqual(
'Title & content: legacy foo'
)
component.filterRules = [
{
rule_type: FILTER_SIMPLE_TEXT,
value: 'foo',
},
]
expect(component.generateFilterName()).toEqual('Title & content: foo')
component.filterRules = [
{
rule_type: FILTER_ASN,
@@ -2156,6 +2214,36 @@ describe('FilterEditorComponent', () => {
})
})
it('should hide deprecated custom fields target from default text filter targets', () => {
expect(component.textFilterTargets).not.toContainEqual({
id: 'custom-fields',
name: $localize`Custom fields (Deprecated)`,
})
})
it('should keep deprecated custom fields target available for legacy filters', fakeAsync(() => {
component.filterRules = [
{
rule_type: FILTER_CUSTOM_FIELDS_TEXT,
value: 'foo',
},
]
fixture.detectChanges()
tick()
expect(component.textFilterTarget).toEqual('custom-fields')
expect(component.textFilterTargets).toContainEqual({
id: 'custom-fields',
name: $localize`Custom fields (Deprecated)`,
})
expect(component.filterRules).toEqual([
{
rule_type: FILTER_CUSTOM_FIELDS_TEXT,
value: 'foo',
},
])
}))
it('should call autocomplete endpoint on input', fakeAsync(() => {
component.textFilterTarget = 'fulltext-query' // TEXT_FILTER_TARGET_FULLTEXT_QUERY
const autocompleteSpy = jest.spyOn(searchService, 'autocomplete')

View File

@@ -71,6 +71,8 @@ import {
FILTER_OWNER_DOES_NOT_INCLUDE,
FILTER_OWNER_ISNULL,
FILTER_SHARED_BY_USER,
FILTER_SIMPLE_TEXT,
FILTER_SIMPLE_TITLE,
FILTER_STORAGE_PATH,
FILTER_TITLE,
FILTER_TITLE_CONTENT,
@@ -195,10 +197,6 @@ const DEFAULT_TEXT_FILTER_TARGET_OPTIONS = [
name: $localize`Title & content`,
},
{ id: TEXT_FILTER_TARGET_ASN, name: $localize`ASN` },
{
id: TEXT_FILTER_TARGET_CUSTOM_FIELDS,
name: $localize`Custom fields`,
},
{ id: TEXT_FILTER_TARGET_MIME_TYPE, name: $localize`File type` },
{
id: TEXT_FILTER_TARGET_FULLTEXT_QUERY,
@@ -206,6 +204,12 @@ const DEFAULT_TEXT_FILTER_TARGET_OPTIONS = [
},
]
const DEPRECATED_CUSTOM_FIELDS_TEXT_FILTER_TARGET_OPTION = {
// Kept only so legacy saved views can render and be edited away from, remove me eventually
id: TEXT_FILTER_TARGET_CUSTOM_FIELDS,
name: $localize`Custom fields (Deprecated)`,
}
const TEXT_FILTER_TARGET_MORELIKE_OPTION = {
id: TEXT_FILTER_TARGET_FULLTEXT_MORELIKE,
name: $localize`More like`,
@@ -318,8 +322,13 @@ export class FilterEditorComponent
return $localize`Custom fields query`
case FILTER_TITLE:
case FILTER_SIMPLE_TITLE:
return $localize`Title: ${rule.value}`
case FILTER_TITLE_CONTENT:
case FILTER_SIMPLE_TEXT:
return $localize`Title & content: ${rule.value}`
case FILTER_ASN:
return $localize`ASN: ${rule.value}`
@@ -353,12 +362,16 @@ export class FilterEditorComponent
_moreLikeDoc: Document
get textFilterTargets() {
let targets = DEFAULT_TEXT_FILTER_TARGET_OPTIONS
if (this.textFilterTarget == TEXT_FILTER_TARGET_FULLTEXT_MORELIKE) {
return DEFAULT_TEXT_FILTER_TARGET_OPTIONS.concat([
TEXT_FILTER_TARGET_MORELIKE_OPTION,
targets = targets.concat([TEXT_FILTER_TARGET_MORELIKE_OPTION])
}
if (this.textFilterTarget == TEXT_FILTER_TARGET_CUSTOM_FIELDS) {
targets = targets.concat([
DEPRECATED_CUSTOM_FIELDS_TEXT_FILTER_TARGET_OPTION,
])
}
return DEFAULT_TEXT_FILTER_TARGET_OPTIONS
return targets
}
textFilterTarget = TEXT_FILTER_TARGET_TITLE_CONTENT
@@ -437,10 +450,12 @@ export class FilterEditorComponent
value.forEach((rule) => {
switch (rule.rule_type) {
case FILTER_TITLE:
case FILTER_SIMPLE_TITLE:
this._textFilter = rule.value
this.textFilterTarget = TEXT_FILTER_TARGET_TITLE
break
case FILTER_TITLE_CONTENT:
case FILTER_SIMPLE_TEXT:
this._textFilter = rule.value
this.textFilterTarget = TEXT_FILTER_TARGET_TITLE_CONTENT
break
@@ -762,12 +777,15 @@ export class FilterEditorComponent
this.textFilterTarget == TEXT_FILTER_TARGET_TITLE_CONTENT
) {
filterRules.push({
rule_type: FILTER_TITLE_CONTENT,
rule_type: FILTER_SIMPLE_TEXT,
value: this._textFilter.trim(),
})
}
if (this._textFilter && this.textFilterTarget == TEXT_FILTER_TARGET_TITLE) {
filterRules.push({ rule_type: FILTER_TITLE, value: this._textFilter })
filterRules.push({
rule_type: FILTER_SIMPLE_TITLE,
value: this._textFilter,
})
}
if (this.textFilterTarget == TEXT_FILTER_TARGET_ASN) {
if (
@@ -1009,7 +1027,10 @@ export class FilterEditorComponent
) {
existingRule = filterRules.find(
(fr) =>
fr.rule_type == FILTER_TITLE_CONTENT || fr.rule_type == FILTER_TITLE
fr.rule_type == FILTER_TITLE_CONTENT ||
fr.rule_type == FILTER_SIMPLE_TEXT ||
fr.rule_type == FILTER_TITLE ||
fr.rule_type == FILTER_SIMPLE_TITLE
)
existingRule.rule_type = FILTER_FULLTEXT_QUERY
}

View File

@@ -3,7 +3,7 @@ import { DataType } from './datatype'
export const NEGATIVE_NULL_FILTER_VALUE = -1
// These correspond to src/documents/models.py and changes here require a DB migration (and vice versa)
export const FILTER_TITLE = 0
export const FILTER_TITLE = 0 // Deprecated in favor of Tantivy-backed `title_search`. Keep for now for existing saved views
export const FILTER_CONTENT = 1
export const FILTER_ASN = 2
@@ -46,7 +46,9 @@ export const FILTER_ADDED_FROM = 46
export const FILTER_MODIFIED_BEFORE = 15
export const FILTER_MODIFIED_AFTER = 16
export const FILTER_TITLE_CONTENT = 19
export const FILTER_TITLE_CONTENT = 19 // Deprecated in favor of Tantivy-backed `text` filtervar. Keep for now for existing saved views
export const FILTER_SIMPLE_TITLE = 48
export const FILTER_SIMPLE_TEXT = 49
export const FILTER_FULLTEXT_QUERY = 20
export const FILTER_FULLTEXT_MORELIKE = 21
@@ -56,7 +58,7 @@ export const FILTER_OWNER_ISNULL = 34
export const FILTER_OWNER_DOES_NOT_INCLUDE = 35
export const FILTER_SHARED_BY_USER = 37
export const FILTER_CUSTOM_FIELDS_TEXT = 36
export const FILTER_CUSTOM_FIELDS_TEXT = 36 // Deprecated. UI no longer includes CF text-search mode. Keep for now for existing saved views
export const FILTER_HAS_CUSTOM_FIELDS_ALL = 38
export const FILTER_HAS_CUSTOM_FIELDS_ANY = 39
export const FILTER_DOES_NOT_HAVE_CUSTOM_FIELDS = 40
@@ -66,6 +68,9 @@ export const FILTER_CUSTOM_FIELDS_QUERY = 42
export const FILTER_MIME_TYPE = 47
export const SIMPLE_TEXT_PARAMETER = 'text'
export const SIMPLE_TITLE_PARAMETER = 'title_search'
export const FILTER_RULE_TYPES: FilterRuleType[] = [
{
id: FILTER_TITLE,
@@ -74,6 +79,13 @@ export const FILTER_RULE_TYPES: FilterRuleType[] = [
multi: false,
default: '',
},
{
id: FILTER_SIMPLE_TITLE,
filtervar: SIMPLE_TITLE_PARAMETER,
datatype: 'string',
multi: false,
default: '',
},
{
id: FILTER_CONTENT,
filtervar: 'content__icontains',
@@ -279,6 +291,12 @@ export const FILTER_RULE_TYPES: FilterRuleType[] = [
datatype: 'string',
multi: false,
},
{
id: FILTER_SIMPLE_TEXT,
filtervar: SIMPLE_TEXT_PARAMETER,
datatype: 'string',
multi: false,
},
{
id: FILTER_FULLTEXT_QUERY,
filtervar: 'query',

View File

@@ -10,7 +10,7 @@ import {
DOCUMENT_SORT_FIELDS,
DOCUMENT_SORT_FIELDS_FULLTEXT,
} from 'src/app/data/document'
import { FILTER_TITLE } from 'src/app/data/filter-rule-type'
import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type'
import { SETTINGS_KEYS } from 'src/app/data/ui-settings'
import { environment } from 'src/environments/environment'
import { PermissionsService } from '../permissions.service'
@@ -138,13 +138,13 @@ describe(`DocumentService`, () => {
subscription = service
.listAllFilteredIds([
{
rule_type: FILTER_TITLE,
rule_type: FILTER_SIMPLE_TITLE,
value: 'apple',
},
])
.subscribe()
const req = httpTestingController.expectOne(
`${environment.apiBaseUrl}${endpoint}/?page=1&page_size=100000&fields=id&title__icontains=apple`
`${environment.apiBaseUrl}${endpoint}/?page=1&page_size=100000&fields=id&title_search=apple`
)
expect(req.request.method).toEqual('GET')
})

View File

@@ -8,6 +8,10 @@ import {
FILTER_HAS_CUSTOM_FIELDS_ALL,
FILTER_HAS_CUSTOM_FIELDS_ANY,
FILTER_HAS_TAGS_ALL,
FILTER_SIMPLE_TEXT,
FILTER_SIMPLE_TITLE,
FILTER_TITLE,
FILTER_TITLE_CONTENT,
NEGATIVE_NULL_FILTER_VALUE,
} from '../data/filter-rule-type'
import {
@@ -128,6 +132,26 @@ describe('QueryParams Utils', () => {
is_tagged: 0,
})
params = queryParamsFromFilterRules([
{
rule_type: FILTER_TITLE_CONTENT,
value: 'bank statement',
},
])
expect(params).toEqual({
text: 'bank statement',
})
params = queryParamsFromFilterRules([
{
rule_type: FILTER_TITLE,
value: 'invoice',
},
])
expect(params).toEqual({
title_search: 'invoice',
})
params = queryParamsFromFilterRules([
{
rule_type: FILTER_HAS_TAGS_ALL,
@@ -148,6 +172,30 @@ describe('QueryParams Utils', () => {
it('should convert filter rules to query params', () => {
let rules = filterRulesFromQueryParams(
convertToParamMap({
text: 'bank statement',
})
)
expect(rules).toEqual([
{
rule_type: FILTER_SIMPLE_TEXT,
value: 'bank statement',
},
])
rules = filterRulesFromQueryParams(
convertToParamMap({
title_search: 'invoice',
})
)
expect(rules).toEqual([
{
rule_type: FILTER_SIMPLE_TITLE,
value: 'invoice',
},
])
rules = filterRulesFromQueryParams(
convertToParamMap({
tags__id__all,
})

View File

@@ -9,8 +9,14 @@ import {
FILTER_HAS_CUSTOM_FIELDS_ALL,
FILTER_HAS_CUSTOM_FIELDS_ANY,
FILTER_RULE_TYPES,
FILTER_SIMPLE_TEXT,
FILTER_SIMPLE_TITLE,
FILTER_TITLE,
FILTER_TITLE_CONTENT,
FilterRuleType,
NEGATIVE_NULL_FILTER_VALUE,
SIMPLE_TEXT_PARAMETER,
SIMPLE_TITLE_PARAMETER,
} from '../data/filter-rule-type'
import { ListViewState } from '../services/document-list-view.service'
@@ -97,6 +103,8 @@ export function transformLegacyFilterRules(
export function filterRulesFromQueryParams(
queryParams: ParamMap
): FilterRule[] {
let filterRulesFromQueryParams: FilterRule[] = []
const allFilterRuleQueryParams: string[] = FILTER_RULE_TYPES.map(
(rt) => rt.filtervar
)
@@ -104,7 +112,6 @@ export function filterRulesFromQueryParams(
.filter((rt) => rt !== undefined)
// transform query params to filter rules
let filterRulesFromQueryParams: FilterRule[] = []
allFilterRuleQueryParams
.filter((frqp) => queryParams.has(frqp))
.forEach((filterQueryParamName) => {
@@ -146,7 +153,17 @@ export function queryParamsFromFilterRules(filterRules: FilterRule[]): Params {
let params = {}
for (let rule of filterRules) {
let ruleType = FILTER_RULE_TYPES.find((t) => t.id == rule.rule_type)
if (ruleType.isnull_filtervar && rule.value == null) {
if (
rule.rule_type === FILTER_TITLE_CONTENT ||
rule.rule_type === FILTER_SIMPLE_TEXT
) {
params[SIMPLE_TEXT_PARAMETER] = rule.value
} else if (
rule.rule_type === FILTER_TITLE ||
rule.rule_type === FILTER_SIMPLE_TITLE
) {
params[SIMPLE_TITLE_PARAMETER] = rule.value
} else if (ruleType.isnull_filtervar && rule.value == null) {
params[ruleType.isnull_filtervar] = 1
} else if (
ruleType.isnull_filtervar &&

View File

@@ -50,14 +50,9 @@ from documents.utils import compute_checksum
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from paperless.config import OcrConfig
from paperless.models import ArchiveFileGenerationChoices
from paperless.parsers import ParserContext
from paperless.parsers import ParserProtocol
from paperless.parsers.registry import get_parser_registry
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
from paperless.parsers.utils import extract_pdf_text
from paperless.parsers.utils import is_tagged_pdf
LOGGING_NAME: Final[str] = "paperless.consumer"
@@ -110,44 +105,6 @@ class ConsumerStatusShortMessage(StrEnum):
FAILED = "failed"
def should_produce_archive(
    parser: "ParserProtocol",
    mime_type: str,
    document_path: Path,
) -> bool:
    """Decide whether a PDF/A archive version should be generated for a document.

    IMPORTANT: *parser* must be an instantiated parser, not the class.
    ``requires_pdf_rendition`` and ``can_produce_archive`` are instance
    ``@property`` methods — accessing them on the class returns the descriptor
    (always truthy).
    """
    # A PDF rendition is mandatory for the frontend to display this format at all.
    if parser.requires_pdf_rendition:
        return True
    # Some parsers (e.g. TextDocumentParser) simply cannot build an archive.
    if not parser.can_produce_archive:
        return False

    generation = OcrConfig().archive_file_generation
    if generation == ArchiveFileGenerationChoices.ALWAYS:
        return True
    if generation == ArchiveFileGenerationChoices.NEVER:
        return False

    # AUTO mode: archive scanned/image documents, skip born-digital PDFs.
    if mime_type.startswith("image/"):
        return True
    if mime_type != "application/pdf":
        return False
    if is_tagged_pdf(document_path):
        # Tagged PDFs are treated as born-digital; no archive needed.
        return False
    pdf_text = extract_pdf_text(document_path)
    # Little or no extractable text implies a scanned PDF that needs OCR/archiving.
    return pdf_text is None or len(pdf_text) <= PDF_TEXT_MIN_LENGTH
class ConsumerPluginMixin:
if TYPE_CHECKING:
from logging import Logger
@@ -481,16 +438,7 @@ class ConsumerPlugin(
)
self.log.debug(f"Parsing {self.filename}...")
produce_archive = should_produce_archive(
document_parser,
mime_type,
self.working_copy,
)
document_parser.parse(
self.working_copy,
mime_type,
produce_archive=produce_archive,
)
document_parser.parse(self.working_copy, mime_type)
self.log.debug(f"Generating thumbnail for {self.filename}...")
self._send_progress(
@@ -839,7 +787,7 @@ class ConsumerPlugin(
return document
def apply_overrides(self, document: Document) -> None:
def apply_overrides(self, document) -> None:
if self.metadata.correspondent_id:
document.correspondent = Correspondent.objects.get(
pk=self.metadata.correspondent_id,

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import functools
import inspect
import json
import logging
import operator
from contextlib import contextmanager
from typing import TYPE_CHECKING
@@ -77,6 +78,8 @@ DATETIME_KWARGS = [
CUSTOM_FIELD_QUERY_MAX_DEPTH = 10
CUSTOM_FIELD_QUERY_MAX_ATOMS = 20
logger = logging.getLogger("paperless.api")
class CorrespondentFilterSet(FilterSet):
class Meta:
@@ -162,9 +165,13 @@ class InboxFilter(Filter):
@extend_schema_field(serializers.CharField)
class TitleContentFilter(Filter):
# Deprecated but retained for existing saved views. UI uses Tantivy-backed `text` / `title_search` params.
def filter(self, qs: Any, value: Any) -> Any:
value = value.strip() if isinstance(value, str) else value
if value:
logger.warning(
"Deprecated document filter parameter 'title_content' used; use `text` instead.",
)
try:
return qs.filter(
Q(title__icontains=value) | Q(effective_content__icontains=value),
@@ -243,6 +250,9 @@ class CustomFieldsFilter(Filter):
def filter(self, qs, value):
value = value.strip() if isinstance(value, str) else value
if value:
logger.warning(
"Deprecated document filter parameter 'custom_fields__icontains' used; use `custom_field_query` or advanced Tantivy field syntax instead.",
)
fields_with_matching_selects = CustomField.objects.filter(
extra_data__icontains=value,
)
@@ -747,6 +757,7 @@ class DocumentFilterSet(FilterSet):
is_in_inbox = InboxFilter()
# Deprecated, but keep for now for existing saved views
title_content = TitleContentFilter()
content__istartswith = EffectiveContentFilter(lookup_expr="istartswith")
@@ -756,6 +767,7 @@ class DocumentFilterSet(FilterSet):
owner__id__none = ObjectFilter(field_name="owner", exclude=True)
# Deprecated, UI no longer includes CF text-search mode, but keep for now for existing saved views
custom_fields__icontains = CustomFieldsFilter()
custom_fields__id__all = ObjectFilter(field_name="custom_fields__field")

View File

@@ -0,0 +1,92 @@
# Generated by Django 5.2.12 on 2026-04-01 18:20
from django.db import migrations
from django.db import models
# Legacy SavedViewFilterRule.rule_type IDs that are replaced by the
# simple-search rule types below.
OLD_TITLE_RULE = 0
OLD_TITLE_CONTENT_RULE = 19
# Replacement rule_type IDs: plain-text title search and title+content search.
NEW_SIMPLE_TITLE_RULE = 48
NEW_SIMPLE_TEXT_RULE = 49
# See documents/models.py SavedViewFilterRule
def migrate_saved_view_rules_forward(apps, schema_editor):
    """Rewrite legacy title / title-or-content rules to the new simple-search rule types."""
    rule_model = apps.get_model("documents", "SavedViewFilterRule")
    # Bulk-update each legacy rule type to its simple-search replacement.
    for legacy_type, replacement_type in (
        (OLD_TITLE_RULE, NEW_SIMPLE_TITLE_RULE),
        (OLD_TITLE_CONTENT_RULE, NEW_SIMPLE_TEXT_RULE),
    ):
        rule_model.objects.filter(rule_type=legacy_type).update(
            rule_type=replacement_type,
        )
class Migration(migrations.Migration):
    dependencies = [
        ("documents", "0017_migrate_fulltext_query_field_prefixes"),
    ]
    operations = [
        # Re-declare the rule_type choices so they include the new
        # simple-search entries (48, 49) added to SavedViewFilterRule.
        migrations.AlterField(
            model_name="savedviewfilterrule",
            name="rule_type",
            field=models.PositiveSmallIntegerField(
                choices=[
                    (0, "title contains"),
                    (1, "content contains"),
                    (2, "ASN is"),
                    (3, "correspondent is"),
                    (4, "document type is"),
                    (5, "is in inbox"),
                    (6, "has tag"),
                    (7, "has any tag"),
                    (8, "created before"),
                    (9, "created after"),
                    (10, "created year is"),
                    (11, "created month is"),
                    (12, "created day is"),
                    (13, "added before"),
                    (14, "added after"),
                    (15, "modified before"),
                    (16, "modified after"),
                    (17, "does not have tag"),
                    (18, "does not have ASN"),
                    (19, "title or content contains"),
                    (20, "fulltext query"),
                    (21, "more like this"),
                    (22, "has tags in"),
                    (23, "ASN greater than"),
                    (24, "ASN less than"),
                    (25, "storage path is"),
                    (26, "has correspondent in"),
                    (27, "does not have correspondent in"),
                    (28, "has document type in"),
                    (29, "does not have document type in"),
                    (30, "has storage path in"),
                    (31, "does not have storage path in"),
                    (32, "owner is"),
                    (33, "has owner in"),
                    (34, "does not have owner"),
                    (35, "does not have owner in"),
                    (36, "has custom field value"),
                    (37, "is shared by me"),
                    (38, "has custom fields"),
                    (39, "has custom field in"),
                    (40, "does not have custom field in"),
                    (41, "does not have custom field"),
                    (42, "custom fields query"),
                    (43, "created to"),
                    (44, "created from"),
                    (45, "added to"),
                    (46, "added from"),
                    (47, "mime type is"),
                    (48, "simple title search"),
                    (49, "simple text search"),
                ],
                verbose_name="rule type",
            ),
        ),
        # Data migration: rewrite legacy rule types 0/19 to 48/49.
        # The reverse is a deliberate no-op (the forward rewrite is lossy).
        migrations.RunPython(
            migrate_saved_view_rules_forward,
            migrations.RunPython.noop,
        ),
    ]

View File

@@ -623,6 +623,8 @@ class SavedViewFilterRule(models.Model):
(45, _("added to")),
(46, _("added from")),
(47, _("mime type is")),
(48, _("simple title search")),
(49, _("simple text search")),
]
saved_view = models.ForeignKey(

View File

@@ -1,4 +1,6 @@
from documents.search._backend import SORT_FIELD_MAP
from documents.search._backend import SearchIndexLockError
from documents.search._backend import SearchMode
from documents.search._backend import SearchResults
from documents.search._backend import TantivyBackend
from documents.search._backend import TantivyRelevanceList
@@ -9,7 +11,9 @@ from documents.search._schema import needs_rebuild
from documents.search._schema import wipe_index
__all__ = [
"SORT_FIELD_MAP",
"SearchIndexLockError",
"SearchMode",
"SearchResults",
"TantivyBackend",
"TantivyRelevanceList",

View File

@@ -2,11 +2,11 @@ from __future__ import annotations
import logging
import threading
import unicodedata
from collections import Counter
from dataclasses import dataclass
from datetime import UTC
from datetime import datetime
from enum import StrEnum
from typing import TYPE_CHECKING
from typing import Self
from typing import TypedDict
@@ -19,7 +19,10 @@ from django.conf import settings
from django.utils.timezone import get_current_timezone
from guardian.shortcuts import get_users_with_perms
from documents.search._normalize import ascii_fold
from documents.search._query import build_permission_filter
from documents.search._query import parse_simple_text_query
from documents.search._query import parse_simple_title_query
from documents.search._query import parse_user_query
from documents.search._schema import _write_sentinels
from documents.search._schema import build_schema
@@ -44,15 +47,25 @@ _AUTOCOMPLETE_REGEX_TIMEOUT = 1.0 # seconds; guards against ReDoS on untrusted
T = TypeVar("T")
# Fields tantivy can sort natively — maps Django ORM field names to tantivy schema fields.
# Fields not listed here (owner, storage_path__name, id, custom_field_*) must fall back to ORM.
# The values are the sortable field names declared in the tantivy schema
# (see documents/search/_schema.py).
SORT_FIELD_MAP: dict[str, str] = {
    "title": "title_sort",
    "correspondent__name": "correspondent_sort",
    "document_type__name": "type_sort",
    "created": "created",
    "added": "added",
    "modified": "modified",
    "archive_serial_number": "asn",
    "page_count": "page_count",
    "num_notes": "num_notes",
}
def _ascii_fold(s: str) -> str:
"""
Normalize unicode to ASCII equivalent characters for search consistency.
Converts accented characters (e.g., "café") to their ASCII base forms ("cafe")
to enable cross-language searching without requiring exact diacritic matching.
"""
return unicodedata.normalize("NFD", s).encode("ascii", "ignore").decode()
class SearchMode(StrEnum):
    # How a query string should be interpreted by TantivyBackend.search().
    # Advanced Tantivy query syntax (field filters, date keywords, ...).
    QUERY = "query"
    # Plain text matched against title and content only.
    TEXT = "text"
    # Plain text matched against title only.
    TITLE = "title"
def _extract_autocomplete_words(text_sources: list[str]) -> set[str]:
@@ -74,7 +87,7 @@ def _extract_autocomplete_words(text_sources: list[str]) -> set[str]:
)
continue
for token in tokens:
normalized = _ascii_fold(token.lower())
normalized = ascii_fold(token.lower())
if normalized:
words.add(normalized)
return words
@@ -294,8 +307,10 @@ class TantivyBackend:
doc.add_text("checksum", document.checksum)
doc.add_text("title", document.title)
doc.add_text("title_sort", document.title)
doc.add_text("simple_title", document.title)
doc.add_text("content", content)
doc.add_text("bigram_content", content)
doc.add_text("simple_content", content)
# Original filename - only add if not None/empty
if document.original_filename:
@@ -433,6 +448,7 @@ class TantivyBackend:
sort_field: str | None,
*,
sort_reverse: bool,
search_mode: SearchMode = SearchMode.QUERY,
) -> SearchResults:
"""
Execute a search query against the document index.
@@ -441,20 +457,32 @@ class TantivyBackend:
permission filtering before executing against Tantivy. Supports both
relevance-based and field-based sorting.
QUERY search mode supports natural date keywords, field filters, etc.
TITLE search mode treats the query as plain text to search for in title only
TEXT search mode treats the query as plain text to search for in title and content
Args:
query: User's search query (supports natural date keywords, field filters)
query: User's search query
user: User for permission filtering (None for superuser/no filtering)
page: Page number (1-indexed) for pagination
page_size: Number of results per page
sort_field: Field to sort by (None for relevance ranking)
sort_reverse: Whether to reverse the sort order
search_mode: "query" for advanced Tantivy syntax, "text" for
plain-text search over title and content only, "title" for
plain-text search over title only
Returns:
SearchResults with hits, total count, and processed query
"""
self._ensure_open()
tz = get_current_timezone()
user_query = parse_user_query(self._index, query, tz)
if search_mode is SearchMode.TEXT:
user_query = parse_simple_text_query(self._index, query)
elif search_mode is SearchMode.TITLE:
user_query = parse_simple_title_query(self._index, query)
else:
user_query = parse_user_query(self._index, query, tz)
# Apply permission filter if user is not None (not superuser)
if user is not None:
@@ -471,22 +499,9 @@ class TantivyBackend:
searcher = self._index.searcher()
offset = (page - 1) * page_size
# Map sort fields
sort_field_map = {
"title": "title_sort",
"correspondent__name": "correspondent_sort",
"document_type__name": "type_sort",
"created": "created",
"added": "added",
"modified": "modified",
"archive_serial_number": "asn",
"page_count": "page_count",
"num_notes": "num_notes",
}
# Perform search
if sort_field and sort_field in sort_field_map:
mapped_field = sort_field_map[sort_field]
if sort_field and sort_field in SORT_FIELD_MAP:
mapped_field = SORT_FIELD_MAP[sort_field]
results = searcher.search(
final_query,
limit=offset + page_size,
@@ -594,7 +609,7 @@ class TantivyBackend:
List of word suggestions ordered by frequency, then alphabetically
"""
self._ensure_open()
normalized_term = _ascii_fold(term.lower())
normalized_term = ascii_fold(term.lower())
searcher = self._index.searcher()

View File

@@ -0,0 +1,8 @@
from __future__ import annotations
import unicodedata
def ascii_fold(text: str) -> str:
    """Normalize unicode text to ASCII equivalents for search consistency.

    NFD-decomposes the input and drops every non-ASCII code point, so
    accented characters collapse to their base form (e.g. "café" -> "cafe").
    """
    decomposed = unicodedata.normalize("NFD", text)
    return decomposed.encode("ascii", "ignore").decode()

View File

@@ -12,6 +12,8 @@ import tantivy
from dateutil.relativedelta import relativedelta
from django.conf import settings
from documents.search._normalize import ascii_fold
if TYPE_CHECKING:
from datetime import tzinfo
@@ -51,6 +53,7 @@ _WHOOSH_REL_RANGE_RE = regex.compile(
)
# Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly
_DATE8_RE = regex.compile(r"(?P<field>\w+):(?P<date8>\d{8})\b")
_SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+")
def _fmt(dt: datetime) -> str:
@@ -436,7 +439,27 @@ DEFAULT_SEARCH_FIELDS = [
"document_type",
"tag",
]
# Field sets used by the simple (non-advanced) search modes.
SIMPLE_SEARCH_FIELDS = ["simple_title", "simple_content"]
TITLE_SEARCH_FIELDS = ["simple_title"]
# Relative weights so title hits outrank content hits in each mode.
_FIELD_BOOSTS = {"title": 2.0}
_SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0}
def _build_simple_field_query(
    index: tantivy.Index,
    field: str,
    tokens: list[str],
) -> tantivy.Query:
    """Build a substring regex query for *field* over the given tokens.

    Each token is wrapped in ``.*token.*`` (regex-escaped) so it matches
    anywhere inside an indexed term; multiple tokens become a regex phrase.
    The result is boosted per ``_SIMPLE_FIELD_BOOSTS``.
    """
    patterns = [f".*{regex.escape(token)}.*" for token in tokens]
    base_query = (
        tantivy.Query.regex_query(index.schema, field, patterns[0])
        if len(patterns) == 1
        else tantivy.Query.regex_phrase_query(index.schema, field, patterns)
    )
    boost = _SIMPLE_FIELD_BOOSTS.get(field, 1.0)
    return tantivy.Query.boost_query(base_query, boost) if boost > 1.0 else base_query
def parse_user_query(
@@ -495,3 +518,52 @@ def parse_user_query(
)
return exact
def parse_simple_query(
    index: tantivy.Index,
    raw_query: str,
    fields: list[str],
) -> tantivy.Query:
    """
    Parse a plain-text query using Tantivy over a restricted field set.

    Tokens are lowercased and ASCII-folded; each field gets a substring
    query over the tokens, and multiple fields are OR-ed together.
    """
    raw_tokens = _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
    # Drop tokens that fold to the empty string (e.g. purely non-ASCII marks).
    tokens = [
        folded for token in raw_tokens if (folded := ascii_fold(token.lower()))
    ]
    if not tokens:
        return tantivy.Query.empty_query()

    per_field = [
        (tantivy.Occur.Should, _build_simple_field_query(index, field, tokens))
        for field in fields
    ]
    if len(per_field) == 1:
        # Single field: no need for a boolean wrapper.
        return per_field[0][1]
    return tantivy.Query.boolean_query(per_field)
def parse_simple_text_query(
    index: tantivy.Index,
    raw_query: str,
) -> tantivy.Query:
    """Parse a plain-text query over title/content for simple search inputs."""
    return parse_simple_query(index, raw_query, SIMPLE_SEARCH_FIELDS)
def parse_simple_title_query(
    index: tantivy.Index,
    raw_query: str,
) -> tantivy.Query:
    """Parse a plain-text query over the title field only."""
    return parse_simple_query(index, raw_query, TITLE_SEARCH_FIELDS)

View File

@@ -53,6 +53,18 @@ def build_schema() -> tantivy.Schema:
# CJK support - not stored, indexed only
sb.add_text_field("bigram_content", stored=False, tokenizer_name="bigram_analyzer")
# Simple substring search support for title/content - not stored, indexed only
sb.add_text_field(
"simple_title",
stored=False,
tokenizer_name="simple_search_analyzer",
)
sb.add_text_field(
"simple_content",
stored=False,
tokenizer_name="simple_search_analyzer",
)
# Autocomplete prefix scan - stored, not indexed
sb.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw")

View File

@@ -70,6 +70,7 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
index.register_tokenizer("paperless_text", _paperless_text(language))
index.register_tokenizer("simple_analyzer", _simple_analyzer())
index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
index.register_tokenizer("simple_search_analyzer", _simple_search_analyzer())
# Fast-field tokenizer required for fast=True text fields in the schema
index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())
@@ -114,3 +115,15 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer:
.filter(tantivy.Filter.lowercase())
.build()
)
def _simple_search_analyzer() -> tantivy.TextAnalyzer:
    """Tokenizer for simple substring search fields: non-whitespace chunks -> lowercase -> ascii_fold."""
    builder = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.regex(r"\S+"))
    builder = builder.filter(tantivy.Filter.lowercase())
    builder = builder.filter(tantivy.Filter.ascii_fold())
    return builder.build()

View File

@@ -30,7 +30,6 @@ from documents.consumer import AsnCheckPlugin
from documents.consumer import ConsumerPlugin
from documents.consumer import ConsumerPreflightPlugin
from documents.consumer import WorkflowTriggerPlugin
from documents.consumer import should_produce_archive
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.double_sided import CollatePlugin
@@ -302,16 +301,7 @@ def update_document_content_maybe_archive_file(document_id) -> None:
parser.configure(ParserContext())
try:
produce_archive = should_produce_archive(
parser,
mime_type,
document.source_path,
)
parser.parse(
document.source_path,
mime_type,
produce_archive=produce_archive,
)
parser.parse(document.source_path, mime_type)
thumbnail = parser.get_thumbnail(document.source_path, mime_type)

View File

@@ -5,6 +5,7 @@ from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import Note
from documents.search._backend import SearchMode
from documents.search._backend import TantivyBackend
from documents.search._backend import get_backend
from documents.search._backend import reset_backend
@@ -46,6 +47,216 @@ class TestWriteBatch:
class TestSearch:
"""Test search functionality."""
def test_text_mode_limits_default_search_to_title_and_content(
    self,
    backend: TantivyBackend,
):
    """Simple text mode must not match metadata-only fields."""
    indexed = Document.objects.create(
        title="Invoice document",
        content="monthly statement",
        checksum="TXT1",
        pk=9,
    )
    backend.add_or_update(indexed)

    def text_search(query: str):
        # Plain-text mode with default paging and relevance ordering.
        return backend.search(
            query,
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TEXT,
        )

    # Field-filter syntax is treated as literal text, so it matches nothing.
    assert text_search("document_type:invoice").total == 0
    # A plain content term still matches.
    assert text_search("monthly").total == 1
def test_title_mode_limits_default_search_to_title_only(
    self,
    backend: TantivyBackend,
):
    """Title mode must not match content-only terms."""
    indexed = Document.objects.create(
        title="Invoice document",
        content="monthly statement",
        checksum="TXT2",
        pk=10,
    )
    backend.add_or_update(indexed)

    def title_search(query: str):
        return backend.search(
            query,
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TITLE,
        )

    # "monthly" appears only in content, so title-only search finds nothing.
    assert title_search("monthly").total == 0
    # A title term is found.
    assert title_search("invoice").total == 1
def test_text_mode_matches_partial_term_substrings(
    self,
    backend: TantivyBackend,
):
    """Simple text mode should support substring matching within tokens."""
    indexed = Document.objects.create(
        title="Account access",
        content="password reset instructions",
        checksum="TXT3",
        pk=11,
    )
    backend.add_or_update(indexed)

    def text_search(query: str):
        return backend.search(
            query,
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TEXT,
        )

    # Prefix, infix, and multi-token substring queries all hit the document.
    for query in ("pass", "sswo", "sswo re"):
        assert text_search(query).total == 1
def test_text_mode_does_not_match_on_partial_term_overlap(
    self,
    backend: TantivyBackend,
):
    """Simple text mode should not match documents that merely share partial fragments."""
    indexed = Document.objects.create(
        title="Adobe Acrobat PDF Files",
        content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
        checksum="TXT7",
        pk=13,
    )
    backend.add_or_update(indexed)

    # "raptor" is not a substring of any indexed token, so nothing matches.
    result = backend.search(
        "raptor",
        user=None,
        page=1,
        page_size=10,
        sort_field=None,
        sort_reverse=False,
        search_mode=SearchMode.TEXT,
    )
    assert result.total == 0
def test_text_mode_ignores_queries_without_searchable_tokens(
    self,
    backend: TantivyBackend,
):
    """Simple text mode should safely return no hits for symbol-only strings."""
    indexed = Document.objects.create(
        title="Guide",
        content="This is a guide.",
        checksum="TXT8",
        pk=14,
    )
    backend.add_or_update(indexed)

    # A symbol-only query must not raise and must return zero hits.
    result = backend.search(
        "!!!",
        user=None,
        page=1,
        page_size=10,
        sort_field=None,
        sort_reverse=False,
        search_mode=SearchMode.TEXT,
    )
    assert result.total == 0
def test_title_mode_matches_partial_term_substrings(
    self,
    backend: TantivyBackend,
):
    """Title mode should support substring matching within title tokens."""
    indexed = Document.objects.create(
        title="Password guide",
        content="reset instructions",
        checksum="TXT4",
        pk=12,
    )
    backend.add_or_update(indexed)

    def title_search(query: str):
        return backend.search(
            query,
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TITLE,
        )

    # Prefix, infix, and multi-token substring queries all hit the title.
    for query in ("pass", "sswo", "sswo gu"):
        assert title_search(query).total == 1
def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend):
"""Search scores must be normalized so top hit has score 1.0 for UI consistency."""
for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]):

View File

@@ -8,6 +8,7 @@ import tantivy
from documents.search._tokenizer import _bigram_analyzer
from documents.search._tokenizer import _paperless_text
from documents.search._tokenizer import _simple_search_analyzer
from documents.search._tokenizer import register_tokenizers
if TYPE_CHECKING:
@@ -41,6 +42,20 @@ class TestTokenizers:
idx.register_tokenizer("bigram_analyzer", _bigram_analyzer())
return idx
@pytest.fixture
def simple_search_index(self) -> tantivy.Index:
    """Index with simple-search field for Latin substring tests."""
    builder = tantivy.SchemaBuilder()
    builder.add_text_field(
        "simple_content",
        stored=False,
        tokenizer_name="simple_search_analyzer",
    )
    index = tantivy.Index(builder.build(), path=None)
    index.register_tokenizer("simple_search_analyzer", _simple_search_analyzer())
    return index
def test_ascii_fold_finds_accented_content(
self,
content_index: tantivy.Index,
@@ -66,6 +81,24 @@ class TestTokenizers:
q = bigram_index.parse_query("東京", ["bigram_content"])
assert bigram_index.searcher().search(q, limit=5).count == 1
def test_simple_search_analyzer_supports_regex_substrings(
    self,
    simple_search_index: tantivy.Index,
) -> None:
    """Whitespace-preserving simple search analyzer supports substring regex matching."""
    writer = simple_search_index.writer()
    document = tantivy.Document()
    document.add_text("simple_content", "tag:invoice password-reset")
    writer.add_document(document)
    writer.commit()
    simple_search_index.reload()

    # An infix of "password-reset" must be found via a regex query.
    substring_query = tantivy.Query.regex_query(
        simple_search_index.schema,
        "simple_content",
        ".*sswo.*",
    )
    hits = simple_search_index.searcher().search(substring_query, limit=5)
    assert hits.count == 1
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
"""Unsupported language codes should log a warning and disable stemming gracefully."""
sb = tantivy.SchemaBuilder()

View File

@@ -46,7 +46,7 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
"pages": None,
"language": None,
"mode": None,
"archive_file_generation": None,
"skip_archive_file": None,
"image_dpi": None,
"unpaper_clean": None,
"deskew": None,

View File

@@ -91,6 +91,127 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.data["count"], 0)
self.assertEqual(len(results), 0)
def test_simple_text_search(self) -> None:
    """The `text` param matches title/content terms but not metadata syntax."""
    invoice_tag = Tag.objects.create(name="invoice")
    text_hit = Document.objects.create(
        title="Quarterly summary",
        content="Monthly bank report",
        checksum="T1",
        pk=11,
    )
    tag_only = Document.objects.create(
        title="Completely unrelated",
        content="No matching terms here",
        checksum="T2",
        pk=12,
    )
    text_hit.tags.add(invoice_tag)
    tag_only.tags.add(invoice_tag)
    backend = get_backend()
    for document in (text_hit, tag_only):
        backend.add_or_update(document)

    # Plain-text term matches only the document containing it.
    response = self.client.get("/api/documents/?text=monthly")
    self.assertEqual(response.status_code, status.HTTP_200_OK)
    self.assertEqual(response.data["count"], 1)
    self.assertEqual(response.data["results"][0]["id"], text_hit.id)

    # Field syntax is treated literally, so the tag filter matches nothing.
    response = self.client.get("/api/documents/?text=tag:invoice")
    self.assertEqual(response.status_code, status.HTTP_200_OK)
    self.assertEqual(response.data["count"], 0)
def test_simple_text_search_matches_substrings(self) -> None:
    """The `text` param supports prefix/infix/multi-token substring queries."""
    target = Document.objects.create(
        title="Quarterly summary",
        content="Password reset instructions",
        checksum="T5",
        pk=15,
    )
    get_backend().add_or_update(target)

    for query in ("pass", "sswo", "sswo re"):
        response = self.client.get(f"/api/documents/?text={query}")
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 1)
        self.assertEqual(response.data["results"][0]["id"], target.id)
def test_simple_text_search_does_not_match_on_partial_term_overlap(self) -> None:
    """A query that is not a substring of any indexed token must not match."""
    unrelated = Document.objects.create(
        title="Adobe Acrobat PDF Files",
        content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
        checksum="T7",
        pk=17,
    )
    get_backend().add_or_update(unrelated)

    response = self.client.get("/api/documents/?text=raptor")
    self.assertEqual(response.status_code, status.HTTP_200_OK)
    self.assertEqual(response.data["count"], 0)
def test_simple_title_search(self) -> None:
    """The `title_search` param matches titles only, never content."""
    in_title = Document.objects.create(
        title="Quarterly summary",
        content="No matching content here",
        checksum="T3",
        pk=13,
    )
    in_content = Document.objects.create(
        title="Completely unrelated",
        content="Quarterly summary appears only in content",
        checksum="T4",
        pk=14,
    )
    backend = get_backend()
    for document in (in_title, in_content):
        backend.add_or_update(document)

    response = self.client.get("/api/documents/?title_search=quarterly")
    self.assertEqual(response.status_code, status.HTTP_200_OK)
    self.assertEqual(response.data["count"], 1)
    self.assertEqual(response.data["results"][0]["id"], in_title.id)
def test_simple_title_search_matches_substrings(self) -> None:
    """The `title_search` param supports prefix/infix/multi-token substrings."""
    target = Document.objects.create(
        title="Password handbook",
        content="No matching content here",
        checksum="T6",
        pk=16,
    )
    get_backend().add_or_update(target)

    for query in ("pass", "sswo", "sswo hand"):
        response = self.client.get(f"/api/documents/?title_search={query}")
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 1)
        self.assertEqual(response.data["results"][0]["id"], target.id)
def test_search_returns_all_for_api_version_9(self) -> None:
d1 = Document.objects.create(
title="invoice",
@@ -1493,6 +1614,31 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
self.assertEqual(results["custom_fields"][0]["id"], custom_field1.id)
self.assertEqual(results["workflows"][0]["id"], workflow1.id)
def test_global_search_db_only_limits_documents_to_title_matches(self) -> None:
    """With db_only=true, global search returns title matches only (no FTS)."""
    doc_title_hit = Document.objects.create(
        title="bank statement",
        content="no additional terms",
        checksum="GS1",
        pk=21,
    )
    doc_content_hit = Document.objects.create(
        title="not a title match",
        content="bank appears only in content",
        checksum="GS2",
        pk=22,
    )
    index = get_backend()
    index.add_or_update(doc_title_hit)
    index.add_or_update(doc_content_hit)
    self.client.force_authenticate(self.user)
    response = self.client.get("/api/search/?query=bank&db_only=true")
    self.assertEqual(response.status_code, status.HTTP_200_OK)
    # The content-only match must be excluded when db_only is requested.
    documents = response.data["documents"]
    self.assertEqual(len(documents), 1)
    self.assertEqual(documents[0]["id"], doc_title_hit.id)
def test_global_search_filters_owned_mail_objects(self) -> None:
user1 = User.objects.create_user("mail-search-user")
user2 = User.objects.create_user("other-mail-search-user")

View File

@@ -1020,7 +1020,7 @@ class TestTagBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, Tes
CONSUMER_TAG_BARCODE_SPLIT=True,
CONSUMER_TAG_BARCODE_MAPPING={"TAG:(.*)": "\\g<1>"},
CELERY_TASK_ALWAYS_EAGER=True,
OCR_MODE="auto",
OCR_MODE="skip",
)
def test_consume_barcode_file_tag_split_and_assignment(self) -> None:
"""

View File

@@ -230,11 +230,7 @@ class TestConsumer(
shutil.copy(src, dst)
return dst
@override_settings(
FILENAME_FORMAT=None,
TIME_ZONE="America/Chicago",
ARCHIVE_FILE_GENERATION="always",
)
@override_settings(FILENAME_FORMAT=None, TIME_ZONE="America/Chicago")
def testNormalOperation(self) -> None:
filename = self.get_test_file()
@@ -633,10 +629,7 @@ class TestConsumer(
# Database empty
self.assertEqual(Document.objects.all().count(), 0)
@override_settings(
FILENAME_FORMAT="{correspondent}/{title}",
ARCHIVE_FILE_GENERATION="always",
)
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
def testFilenameHandling(self) -> None:
with self.get_consumer(
self.get_test_file(),
@@ -653,7 +646,7 @@ class TestConsumer(
self._assert_first_last_send_progress()
@mock.patch("documents.consumer.generate_unique_filename")
@override_settings(FILENAME_FORMAT="{pk}", ARCHIVE_FILE_GENERATION="always")
@override_settings(FILENAME_FORMAT="{pk}")
def testFilenameHandlingFallsBackWhenGeneratedPathExceedsDbLimit(self, m):
m.side_effect = lambda doc, archive_filename=False: Path(
("a" * 1100 + ".pdf") if not archive_filename else ("b" * 1100 + ".pdf"),
@@ -680,10 +673,7 @@ class TestConsumer(
self._assert_first_last_send_progress()
@override_settings(
FILENAME_FORMAT="{correspondent}/{title}",
ARCHIVE_FILE_GENERATION="always",
)
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
@mock.patch("documents.signals.handlers.generate_unique_filename")
def testFilenameHandlingUnstableFormat(self, m) -> None:
filenames = ["this", "that", "now this", "i cannot decide"]
@@ -1031,7 +1021,7 @@ class TestConsumer(
self.assertEqual(Document.objects.count(), 2)
self._assert_first_last_send_progress()
@override_settings(FILENAME_FORMAT="{title}", ARCHIVE_FILE_GENERATION="always")
@override_settings(FILENAME_FORMAT="{title}")
@mock.patch("documents.consumer.get_parser_registry")
def test_similar_filenames(self, m) -> None:
shutil.copy(
@@ -1142,7 +1132,6 @@ class TestConsumer(
mock_mail_parser_parse.assert_called_once_with(
consumer.working_copy,
"message/rfc822",
produce_archive=True,
)
@@ -1290,14 +1279,7 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
def test_no_pre_consume_script(self, m) -> None:
with self.get_consumer(self.test_file) as c:
c.run()
# Verify no pre-consume script subprocess was invoked
# (run_subprocess may still be called by _extract_text_for_archive_check)
script_calls = [
call
for call in m.call_args_list
if call.args and call.args[0] and call.args[0][0] not in ("pdftotext",)
]
self.assertEqual(script_calls, [])
m.assert_not_called()
@mock.patch("documents.consumer.run_subprocess")
@override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
@@ -1313,16 +1295,9 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
with self.get_consumer(self.test_file) as c:
c.run()
self.assertTrue(m.called)
m.assert_called_once()
# Find the call that invoked the pre-consume script
# (run_subprocess may also be called by _extract_text_for_archive_check)
script_call = next(
call
for call in m.call_args_list
if call.args and call.args[0] and call.args[0][0] == script.name
)
args, _ = script_call
args, _ = m.call_args
command = args[0]
environment = args[1]

View File

@@ -1,189 +0,0 @@
"""Tests for should_produce_archive()."""
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
from unittest.mock import MagicMock
import pytest
from documents.consumer import should_produce_archive
if TYPE_CHECKING:
from pytest_mock import MockerFixture
def _parser_instance(
*,
can_produce: bool = True,
requires_rendition: bool = False,
) -> MagicMock:
"""Return a mock parser instance with the given capability flags."""
instance = MagicMock()
instance.can_produce_archive = can_produce
instance.requires_pdf_rendition = requires_rendition
return instance
@pytest.fixture()
def null_app_config(mocker) -> MagicMock:
    """Mock ApplicationConfiguration with all fields None → falls back to Django settings."""
    # dict.fromkeys gives every field a None value; the mock then reports
    # None for each config attribute, exactly like an unset singleton row.
    blank_fields = dict.fromkeys(
        (
            "output_type",
            "pages",
            "language",
            "mode",
            "archive_file_generation",
            "image_dpi",
            "unpaper_clean",
            "deskew",
            "rotate_pages",
            "rotate_pages_threshold",
            "max_image_pixels",
            "color_conversion_strategy",
            "user_args",
        ),
    )
    return mocker.MagicMock(**blank_fields)
@pytest.fixture(autouse=True)
def patch_app_config(mocker, null_app_config) -> None:
    """Patch BaseConfig._get_config_instance for all tests in this module.

    autouse=True means every test here sees the all-None ApplicationConfiguration
    mock, so OCR/archive settings resolve from Django settings values instead.
    The patch is undone automatically by pytest-mock at test teardown.
    """
    mocker.patch(
        "paperless.config.BaseConfig._get_config_instance",
        return_value=null_app_config,
    )
class TestShouldProduceArchive:
    """Table-driven tests for should_produce_archive().

    Covers the ARCHIVE_FILE_GENERATION setting ("never"/"always"/"auto"),
    the parser capability flags, and the auto-mode PDF text heuristics.
    """
    @pytest.mark.parametrize(
        ("generation", "can_produce", "requires_rendition", "mime", "expected"),
        [
            pytest.param(
                "never",
                True,
                False,
                "application/pdf",
                False,
                id="never-returns-false",
            ),
            pytest.param(
                "always",
                True,
                False,
                "application/pdf",
                True,
                id="always-returns-true",
            ),
            # requires_pdf_rendition forces an archive even in "never" mode.
            pytest.param(
                "never",
                True,
                True,
                "application/pdf",
                True,
                id="requires-rendition-overrides-never",
            ),
            # A parser that cannot produce an archive wins over "always".
            pytest.param(
                "always",
                False,
                False,
                "text/plain",
                False,
                id="cannot-produce-overrides-always",
            ),
            pytest.param(
                "always",
                False,
                True,
                "application/pdf",
                True,
                id="requires-rendition-wins-even-if-cannot-produce",
            ),
            pytest.param(
                "auto",
                True,
                False,
                "image/tiff",
                True,
                id="auto-image-returns-true",
            ),
            pytest.param(
                "auto",
                True,
                False,
                "message/rfc822",
                False,
                id="auto-non-pdf-non-image-returns-false",
            ),
        ],
    )
    def test_generation_setting(
        self,
        settings,
        generation: str,
        can_produce: bool,  # noqa: FBT001
        requires_rendition: bool,  # noqa: FBT001
        mime: str,
        expected: bool,  # noqa: FBT001
    ) -> None:
        # Each parametrized row exercises one combination of the setting
        # value with the parser capability flags.
        settings.ARCHIVE_FILE_GENERATION = generation
        parser = _parser_instance(
            can_produce=can_produce,
            requires_rendition=requires_rendition,
        )
        assert should_produce_archive(parser, mime, Path("/tmp/doc")) is expected
    @pytest.mark.parametrize(
        ("extracted_text", "expected"),
        [
            pytest.param(
                "This is a born-digital PDF with lots of text content. " * 10,
                False,
                id="born-digital-long-text-skips-archive",
            ),
            pytest.param(None, True, id="no-text-scanned-produces-archive"),
            pytest.param("tiny", True, id="short-text-treated-as-scanned"),
        ],
    )
    def test_auto_pdf_archive_decision(
        self,
        mocker: MockerFixture,
        settings,
        extracted_text: str | None,
        expected: bool,  # noqa: FBT001
    ) -> None:
        # In auto mode, an untagged PDF produces an archive only when the
        # extracted text is missing or too short to count as born-digital.
        settings.ARCHIVE_FILE_GENERATION = "auto"
        mocker.patch("documents.consumer.is_tagged_pdf", return_value=False)
        mocker.patch("documents.consumer.extract_pdf_text", return_value=extracted_text)
        parser = _parser_instance(can_produce=True, requires_rendition=False)
        assert (
            should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
            is expected
        )
    def test_tagged_pdf_skips_archive_in_auto_mode(
        self,
        mocker: MockerFixture,
        settings,
    ) -> None:
        """Tagged PDFs (e.g. Word exports) are treated as born-digital regardless of text length."""
        settings.ARCHIVE_FILE_GENERATION = "auto"
        mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
        parser = _parser_instance(can_produce=True, requires_rendition=False)
        assert (
            should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
            is False
        )
    def test_tagged_pdf_does_not_call_pdftotext(
        self,
        mocker: MockerFixture,
        settings,
    ) -> None:
        """When a PDF is tagged, pdftotext is not invoked (fast path)."""
        settings.ARCHIVE_FILE_GENERATION = "auto"
        mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
        mock_extract = mocker.patch("documents.consumer.extract_pdf_text")
        parser = _parser_instance(can_produce=True, requires_rendition=False)
        should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
        # The tagged-PDF fast path must decide without extracting text.
        mock_extract.assert_not_called()

View File

@@ -27,10 +27,7 @@ sample_file: Path = Path(__file__).parent / "samples" / "simple.pdf"
@pytest.mark.management
@override_settings(
FILENAME_FORMAT="{correspondent}/{title}",
ARCHIVE_FILE_GENERATION="always",
)
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def make_models(self):
return Document.objects.create(

View File

@@ -213,7 +213,6 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(Document.global_objects.count(), 0)
@override_settings(ARCHIVE_FILE_GENERATION="always")
class TestUpdateContent(DirectoriesMixin, TestCase):
def test_update_content_maybe_archive_file(self) -> None:
"""

View File

@@ -1995,11 +1995,23 @@ class ChatStreamingView(GenericAPIView):
list=extend_schema(
description="Document views including search",
parameters=[
OpenApiParameter(
name="text",
type=OpenApiTypes.STR,
location=OpenApiParameter.QUERY,
description="Simple Tantivy-backed text search query string",
),
OpenApiParameter(
name="title_search",
type=OpenApiTypes.STR,
location=OpenApiParameter.QUERY,
description="Simple Tantivy-backed title-only search query string",
),
OpenApiParameter(
name="query",
type=OpenApiTypes.STR,
location=OpenApiParameter.QUERY,
description="Advanced search query string",
description="Advanced Tantivy search query string",
),
OpenApiParameter(
name="full_perms",
@@ -2033,7 +2045,9 @@ class UnifiedSearchViewSet(DocumentViewSet):
def _is_search_request(self):
return (
"query" in self.request.query_params
"text" in self.request.query_params
or "title_search" in self.request.query_params
or "query" in self.request.query_params
or "more_like_id" in self.request.query_params
)
@@ -2041,26 +2055,66 @@ class UnifiedSearchViewSet(DocumentViewSet):
if not self._is_search_request():
return super().list(request)
from documents.search import SORT_FIELD_MAP
from documents.search import SearchMode
from documents.search import TantivyRelevanceList
from documents.search import get_backend
try:
backend = get_backend()
# ORM-filtered queryset: permissions + field filters + ordering (DRF backends applied)
# ORM queryset with field filters applied (tags, correspondent, etc.); used to
# intersect with tantivy results. Tantivy handles permission filtering via user param.
filtered_qs = self.filter_queryset(self.get_queryset())
user = None if request.user.is_superuser else request.user
if "query" in request.query_params:
query_str = request.query_params["query"]
results = backend.search(
query_str,
user=user,
page=1,
page_size=10000,
sort_field=None,
sort_reverse=False,
)
# Parse ordering: extract field name and direction
raw_ordering = request.query_params.get("ordering", "")
sort_reverse = raw_ordering.startswith("-")
sort_field = raw_ordering.lstrip("-") or None
# Use tantivy native sorting for indexed fields; fall back to ORM for the rest
# (owner, storage_path__name, id, custom_field_* require ORM ordering)
tantivy_sort_field = sort_field if sort_field in SORT_FIELD_MAP else None
if (
"text" in request.query_params
or "title_search" in request.query_params
or "query" in request.query_params
):
if "text" in request.query_params:
search_mode = SearchMode.TEXT
query_str = request.query_params["text"]
elif "title_search" in request.query_params:
search_mode = SearchMode.TITLE
query_str = request.query_params["title_search"]
else:
search_mode = SearchMode.QUERY
query_str = request.query_params["query"]
if tantivy_sort_field:
# Tantivy handles sorting and pagination natively
page_size = self.paginator.get_page_size(request)
page_num = int(request.query_params.get("page", 1))
results = backend.search(
query_str,
user=user,
page=page_num,
page_size=page_size,
sort_field=tantivy_sort_field,
sort_reverse=sort_reverse,
search_mode=search_mode,
)
else:
# Relevance or ORM-sorted fallback: fetch all hits for downstream ordering
results = backend.search(
query_str,
user=user,
page=1,
page_size=10000,
sort_field=None,
sort_reverse=False,
search_mode=search_mode,
)
else:
# more_like_id — validate permission on the seed document first
try:
@@ -2084,17 +2138,44 @@ class UnifiedSearchViewSet(DocumentViewSet):
page=1,
page_size=10000,
)
tantivy_sort_field = None # MLT always uses relevance order
hits_by_id = {h["id"]: h for h in results.hits}
# Determine sort order: no ordering param -> Tantivy relevance; otherwise -> ORM order
ordering_param = request.query_params.get("ordering", "").lstrip("-")
if not ordering_param:
# Preserve Tantivy relevance order; intersect with ORM-visible IDs
if tantivy_sort_field:
# Tantivy sorted and paginated; intersect with ORM to apply field filters
orm_ids = set(filtered_qs.values_list("pk", flat=True))
ordered_hits = [h for h in results.hits if h["id"] in orm_ids]
serializer = self.get_serializer(ordered_hits, many=True)
response = self.get_paginated_response(serializer.data)
response.data["corrected_query"] = None
response.data["count"] = results.total
if get_boolean(
str(request.query_params.get("include_selection_data", "false")),
):
# Fetch all matched IDs for selection data (separate query, ID-only)
all_results = backend.search(
query_str,
user=user,
page=1,
page_size=10000,
sort_field=None,
sort_reverse=False,
search_mode=search_mode,
)
all_ids = [h["id"] for h in all_results.hits]
response.data["selection_data"] = (
self._get_selection_data_for_queryset(
filtered_qs.filter(pk__in=all_ids),
)
)
return response
elif not sort_field:
# Relevance order — preserve tantivy ranking, intersect with ORM for field filters
orm_ids = set(filtered_qs.values_list("pk", flat=True))
ordered_hits = [h for h in results.hits if h["id"] in orm_ids]
else:
# Use ORM ordering (already applied by DocumentsOrderingFilter)
# ORM ordering fallback for fields tantivy can't sort (owner, storage_path__name, id, custom_field_*)
hit_ids = set(hits_by_id.keys())
orm_ordered_ids = filtered_qs.filter(id__in=hit_ids).values_list(
"pk",
@@ -3003,6 +3084,9 @@ class GlobalSearchView(PassUserMixin):
serializer_class = SearchResultSerializer
def get(self, request, *args, **kwargs):
from documents.search import get_backend
from documents.search._backend import SearchMode
query = request.query_params.get("query", None)
if query is None:
return HttpResponseBadRequest("Query required")
@@ -3019,25 +3103,25 @@ class GlobalSearchView(PassUserMixin):
"view_document",
Document,
)
# First search by title
docs = all_docs.filter(title__icontains=query)
if not db_only and len(docs) < OBJECT_LIMIT:
# If we don't have enough results, search by content.
# Over-fetch from Tantivy (no permission filter) and rely on
# the ORM all_docs queryset for authoritative permission gating.
from documents.search import get_backend
if db_only:
docs = all_docs.filter(title__icontains=query)[:OBJECT_LIMIT]
else:
user = None if request.user.is_superuser else request.user
fts_results = get_backend().search(
query,
user=None,
user=user,
page=1,
page_size=1000,
sort_field=None,
sort_reverse=False,
search_mode=SearchMode.TEXT,
)
fts_ids = {h["id"] for h in fts_results.hits}
docs = docs | all_docs.filter(id__in=fts_ids)
docs = docs[:OBJECT_LIMIT]
docs_by_id = all_docs.in_bulk([hit["id"] for hit in fts_results.hits])
docs = [
docs_by_id[hit["id"]]
for hit in fts_results.hits
if hit["id"] in docs_by_id
][:OBJECT_LIMIT]
saved_views = (
get_objects_for_user_owner_aware(
request.user,

View File

@@ -5,7 +5,6 @@ import shutil
import stat
import subprocess
from pathlib import Path
from typing import Any
from django.conf import settings
from django.core.checks import Error
@@ -23,7 +22,7 @@ writeable_hint = (
)
def path_check(var: str, directory: Path) -> list[Error]:
def path_check(var, directory: Path) -> list[Error]:
messages: list[Error] = []
if directory:
if not directory.is_dir():
@@ -60,7 +59,7 @@ def path_check(var: str, directory: Path) -> list[Error]:
@register()
def paths_check(app_configs: Any, **kwargs: Any) -> list[Error]:
def paths_check(app_configs, **kwargs) -> list[Error]:
"""
Check the various paths for existence, readability and writeability
"""
@@ -74,7 +73,7 @@ def paths_check(app_configs: Any, **kwargs: Any) -> list[Error]:
@register()
def binaries_check(app_configs: Any, **kwargs: Any) -> list[Error]:
def binaries_check(app_configs, **kwargs):
"""
Paperless requires the existence of a few binaries, so we do some checks
for those here.
@@ -94,7 +93,7 @@ def binaries_check(app_configs: Any, **kwargs: Any) -> list[Error]:
@register()
def debug_mode_check(app_configs: Any, **kwargs: Any) -> list[Warning]:
def debug_mode_check(app_configs, **kwargs):
if settings.DEBUG:
return [
Warning(
@@ -110,7 +109,7 @@ def debug_mode_check(app_configs: Any, **kwargs: Any) -> list[Warning]:
@register()
def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warning]:
def settings_values_check(app_configs, **kwargs):
"""
Validates at least some of the user provided settings
"""
@@ -133,14 +132,23 @@ def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warni
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
)
if settings.OCR_MODE not in {"auto", "force", "redo", "off"}:
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
if settings.ARCHIVE_FILE_GENERATION not in {"auto", "always", "never"}:
if settings.OCR_MODE == "skip_noarchive":
msgs.append(
Warning(
'OCR output mode "skip_noarchive" is deprecated and will be '
"removed in a future version. Please use "
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
),
)
if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
msgs.append(
Error(
"PAPERLESS_ARCHIVE_FILE_GENERATION setting "
f'"{settings.ARCHIVE_FILE_GENERATION}" is not valid',
"OCR_SKIP_ARCHIVE_FILE setting "
f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid',
),
)
@@ -183,7 +191,7 @@ def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warni
@register()
def audit_log_check(app_configs: Any, **kwargs: Any) -> list[Error]:
def audit_log_check(app_configs, **kwargs):
db_conn = connections["default"]
all_tables = db_conn.introspection.table_names()
result = []
@@ -295,42 +303,7 @@ def check_deprecated_db_settings(
@register()
def check_deprecated_v2_ocr_env_vars(
app_configs: object,
**kwargs: object,
) -> list[Warning]:
"""Warn when deprecated v2 OCR environment variables are set.
Users upgrading from v2 may still have these in their environment or
config files, where they are now silently ignored.
"""
warnings: list[Warning] = []
if os.environ.get("PAPERLESS_OCR_SKIP_ARCHIVE_FILE"):
warnings.append(
Warning(
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE is set but has no effect. "
"Use PAPERLESS_ARCHIVE_FILE_GENERATION=never/always/auto instead.",
id="paperless.W002",
),
)
ocr_mode = os.environ.get("PAPERLESS_OCR_MODE", "")
if ocr_mode in {"skip", "skip_noarchive"}:
warnings.append(
Warning(
f"PAPERLESS_OCR_MODE={ocr_mode!r} is not a valid value. "
f"Use PAPERLESS_OCR_MODE=auto (and PAPERLESS_ARCHIVE_FILE_GENERATION=never "
f"if you used skip_noarchive) instead.",
id="paperless.W003",
),
)
return warnings
@register()
def check_remote_parser_configured(app_configs: Any, **kwargs: Any) -> list[Error]:
def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
):
@@ -356,7 +329,7 @@ def get_tesseract_langs():
@register()
def check_default_language_available(app_configs: Any, **kwargs: Any) -> list[Error]:
def check_default_language_available(app_configs, **kwargs):
errs = []
if not settings.OCR_LANGUAGE:

View File

@@ -4,11 +4,6 @@ import json
from django.conf import settings
from paperless.models import ApplicationConfiguration
from paperless.models import ArchiveFileGenerationChoices
from paperless.models import CleanChoices
from paperless.models import ColorConvertChoices
from paperless.models import ModeChoices
from paperless.models import OutputTypeChoices
@dataclasses.dataclass
@@ -33,7 +28,7 @@ class OutputTypeConfig(BaseConfig):
Almost all parsers care about the chosen PDF output format
"""
output_type: OutputTypeChoices = dataclasses.field(init=False)
output_type: str = dataclasses.field(init=False)
def __post_init__(self) -> None:
app_config = self._get_config_instance()
@@ -50,17 +45,15 @@ class OcrConfig(OutputTypeConfig):
pages: int | None = dataclasses.field(init=False)
language: str = dataclasses.field(init=False)
mode: ModeChoices = dataclasses.field(init=False)
archive_file_generation: ArchiveFileGenerationChoices = dataclasses.field(
init=False,
)
mode: str = dataclasses.field(init=False)
skip_archive_file: str = dataclasses.field(init=False)
image_dpi: int | None = dataclasses.field(init=False)
clean: CleanChoices = dataclasses.field(init=False)
clean: str = dataclasses.field(init=False)
deskew: bool = dataclasses.field(init=False)
rotate: bool = dataclasses.field(init=False)
rotate_threshold: float = dataclasses.field(init=False)
max_image_pixel: float | None = dataclasses.field(init=False)
color_conversion_strategy: ColorConvertChoices = dataclasses.field(init=False)
color_conversion_strategy: str = dataclasses.field(init=False)
user_args: dict[str, str] | None = dataclasses.field(init=False)
def __post_init__(self) -> None:
@@ -71,8 +64,8 @@ class OcrConfig(OutputTypeConfig):
self.pages = app_config.pages or settings.OCR_PAGES
self.language = app_config.language or settings.OCR_LANGUAGE
self.mode = app_config.mode or settings.OCR_MODE
self.archive_file_generation = (
app_config.archive_file_generation or settings.ARCHIVE_FILE_GENERATION
self.skip_archive_file = (
app_config.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
)
self.image_dpi = app_config.image_dpi or settings.OCR_IMAGE_DPI
self.clean = app_config.unpaper_clean or settings.OCR_CLEAN

View File

@@ -1,44 +0,0 @@
# Generated by Django 5.2.12 on 2026-03-26 20:31
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
    """Swap ApplicationConfiguration.skip_archive_file for archive_file_generation.

    Also re-declares the OCR ``mode`` field with the narrowed choice set
    (auto/force/redo/off).
    """
    dependencies = [
        ("paperless", "0007_optimize_integer_field_sizes"),
    ]
    operations = [
        # The old skip_archive_file option is removed outright — no data
        # migration; users set the new archive_file_generation value instead.
        migrations.RemoveField(
            model_name="applicationconfiguration",
            name="skip_archive_file",
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="archive_file_generation",
            field=models.CharField(
                blank=True,
                choices=[("auto", "auto"), ("always", "always"), ("never", "never")],
                max_length=8,
                null=True,
                verbose_name="Controls archive file generation",
            ),
        ),
        # Re-declare mode with the updated choices list.
        migrations.AlterField(
            model_name="applicationconfiguration",
            name="mode",
            field=models.CharField(
                blank=True,
                choices=[
                    ("auto", "auto"),
                    ("force", "force"),
                    ("redo", "redo"),
                    ("off", "off"),
                ],
                max_length=16,
                null=True,
                verbose_name="Sets the OCR mode",
            ),
        ),
    ]

View File

@@ -36,20 +36,20 @@ class ModeChoices(models.TextChoices):
and our own custom setting
"""
AUTO = ("auto", _("auto"))
FORCE = ("force", _("force"))
SKIP = ("skip", _("skip"))
REDO = ("redo", _("redo"))
OFF = ("off", _("off"))
FORCE = ("force", _("force"))
SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive"))
class ArchiveFileGenerationChoices(models.TextChoices):
class ArchiveFileChoices(models.TextChoices):
"""
Settings to control creation of an archive PDF file
"""
AUTO = ("auto", _("auto"))
ALWAYS = ("always", _("always"))
NEVER = ("never", _("never"))
WITH_TEXT = ("with_text", _("with_text"))
ALWAYS = ("always", _("always"))
class CleanChoices(models.TextChoices):
@@ -126,12 +126,12 @@ class ApplicationConfiguration(AbstractSingletonModel):
choices=ModeChoices.choices,
)
archive_file_generation = models.CharField(
verbose_name=_("Controls archive file generation"),
skip_archive_file = models.CharField(
verbose_name=_("Controls the generation of an archive file"),
null=True,
blank=True,
max_length=8,
choices=ArchiveFileGenerationChoices.choices,
max_length=16,
choices=ArchiveFileChoices.choices,
)
image_dpi = models.PositiveSmallIntegerField(

View File

@@ -1,6 +1,5 @@
from __future__ import annotations
import importlib.resources
import logging
import os
import re
@@ -9,8 +8,6 @@ import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Any
from typing import Final
from typing import NoReturn
from typing import Self
from django.conf import settings
@@ -21,11 +18,9 @@ from documents.parsers import make_thumbnail_from_pdf
from documents.utils import maybe_override_pixel_limit
from documents.utils import run_subprocess
from paperless.config import OcrConfig
from paperless.models import ArchiveFileChoices
from paperless.models import CleanChoices
from paperless.models import ModeChoices
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
from paperless.parsers.utils import extract_pdf_text
from paperless.parsers.utils import is_tagged_pdf
from paperless.parsers.utils import read_file_handle_unicode_errors
from paperless.version import __full_version_str__
@@ -38,11 +33,7 @@ if TYPE_CHECKING:
logger = logging.getLogger("paperless.parsing.tesseract")
_SRGB_ICC_DATA: Final[bytes] = (
importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
)
_SUPPORTED_MIME_TYPES: Final[dict[str, str]] = {
_SUPPORTED_MIME_TYPES: dict[str, str] = {
"application/pdf": ".pdf",
"image/jpeg": ".jpg",
"image/png": ".png",
@@ -108,7 +99,7 @@ class RasterisedDocumentParser:
# Lifecycle
# ------------------------------------------------------------------
def __init__(self, logging_group: object | None = None) -> None:
def __init__(self, logging_group: object = None) -> None:
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
self.tempdir = Path(
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
@@ -242,7 +233,7 @@ class RasterisedDocumentParser:
if (
sidecar_file is not None
and sidecar_file.is_file()
and self.settings.mode != ModeChoices.REDO
and self.settings.mode != "redo"
):
text = read_file_handle_unicode_errors(sidecar_file)
@@ -259,7 +250,36 @@ class RasterisedDocumentParser:
if not Path(pdf_file).is_file():
return None
return post_process_text(extract_pdf_text(Path(pdf_file), log=self.log))
try:
text = None
with tempfile.NamedTemporaryFile(
mode="w+",
dir=self.tempdir,
) as tmp:
run_subprocess(
[
"pdftotext",
"-q",
"-layout",
"-enc",
"UTF-8",
str(pdf_file),
tmp.name,
],
logger=self.log,
)
text = read_file_handle_unicode_errors(Path(tmp.name))
return post_process_text(text)
except Exception:
# If pdftotext fails, fall back to OCR.
self.log.warning(
"Error while getting text from PDF document with pdftotext",
exc_info=True,
)
# probably not a PDF file.
return None
def construct_ocrmypdf_parameters(
self,
@@ -269,7 +289,6 @@ class RasterisedDocumentParser:
sidecar_file: Path,
*,
safe_fallback: bool = False,
skip_text: bool = False,
) -> dict[str, Any]:
ocrmypdf_args: dict[str, Any] = {
"input_file_or_options": input_file,
@@ -288,14 +307,15 @@ class RasterisedDocumentParser:
self.settings.color_conversion_strategy
)
if safe_fallback or self.settings.mode == ModeChoices.FORCE:
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
ocrmypdf_args["force_ocr"] = True
elif self.settings.mode in {
ModeChoices.SKIP,
ModeChoices.SKIP_NO_ARCHIVE,
}:
ocrmypdf_args["skip_text"] = True
elif self.settings.mode == ModeChoices.REDO:
ocrmypdf_args["redo_ocr"] = True
elif skip_text or self.settings.mode == ModeChoices.OFF:
ocrmypdf_args["skip_text"] = True
elif self.settings.mode == ModeChoices.AUTO:
pass # no extra flag: normal OCR (text not found case)
else: # pragma: no cover
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
@@ -380,74 +400,6 @@ class RasterisedDocumentParser:
return ocrmypdf_args
def _convert_image_to_pdfa(self, document_path: Path) -> Path:
"""Convert an image to a PDF/A-2b file without invoking the OCR engine.
Uses img2pdf for the initial image->PDF wrapping, then pikepdf to stamp
PDF/A-2b conformance metadata.
No Tesseract and no Ghostscript are invoked.
"""
import img2pdf
import pikepdf
plain_pdf_path = Path(self.tempdir) / "image_plain.pdf"
try:
layout_fun = None
if self.settings.image_dpi is not None:
layout_fun = img2pdf.get_fixed_dpi_layout_fun(
(self.settings.image_dpi, self.settings.image_dpi),
)
plain_pdf_path.write_bytes(
img2pdf.convert(str(document_path), layout_fun=layout_fun),
)
except Exception as e:
raise ParseError(
f"img2pdf conversion failed for {document_path}: {e!s}",
) from e
pdfa_path = Path(self.tempdir) / "archive.pdf"
try:
with pikepdf.open(plain_pdf_path) as pdf:
cs = pdf.make_stream(_SRGB_ICC_DATA)
cs["/N"] = 3
output_intent = pikepdf.Dictionary(
Type=pikepdf.Name("/OutputIntent"),
S=pikepdf.Name("/GTS_PDFA1"),
OutputConditionIdentifier=pikepdf.String("sRGB"),
DestOutputProfile=cs,
)
pdf.Root["/OutputIntents"] = pdf.make_indirect(
pikepdf.Array([output_intent]),
)
meta = pdf.open_metadata(set_pikepdf_as_editor=False)
meta["pdfaid:part"] = "2"
meta["pdfaid:conformance"] = "B"
pdf.save(pdfa_path)
except Exception as e:
self.log.warning(
f"PDF/A metadata stamping failed ({e!s}); falling back to plain PDF.",
)
pdfa_path.write_bytes(plain_pdf_path.read_bytes())
return pdfa_path
def _handle_subprocess_output_error(self, e: Exception) -> NoReturn:
"""Log context for Ghostscript failures and raise ParseError.
Called from the SubprocessOutputError handlers in parse() to avoid
duplicating the Ghostscript hint and re-raise logic.
"""
if "Ghostscript PDF/A rendering" in str(e):
self.log.warning(
"Ghostscript PDF/A rendering failed, consider setting "
"PAPERLESS_OCR_USER_ARGS: "
"'{\"continue_on_soft_render_error\": true}'",
)
raise ParseError(
f"SubprocessOutputError: {e!s}. See logs for more information.",
) from e
def parse(
self,
document_path: Path,
@@ -457,94 +409,57 @@ class RasterisedDocumentParser:
) -> None:
# This forces tesseract to use one core per page.
os.environ["OMP_THREAD_LIMIT"] = "1"
VALID_TEXT_LENGTH = 50
if mime_type == "application/pdf":
text_original = self.extract_text(None, document_path)
original_has_text = (
text_original is not None and len(text_original) > VALID_TEXT_LENGTH
)
else:
text_original = None
original_has_text = False
# If the original has text, and the user doesn't want an archive,
# we're done here
skip_archive_for_text = (
self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
or self.settings.skip_archive_file
in {
ArchiveFileChoices.WITH_TEXT,
ArchiveFileChoices.ALWAYS,
}
)
if skip_archive_for_text and original_has_text:
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
self.text = text_original
return
# Either no text was in the original or there should be an archive
# file created, so OCR the file and create an archive with any
# text located via OCR
import ocrmypdf
from ocrmypdf import EncryptedPdfError
from ocrmypdf import InputFileError
from ocrmypdf import SubprocessOutputError
from ocrmypdf.exceptions import DigitalSignatureError
from ocrmypdf.exceptions import PriorOcrFoundError
if mime_type == "application/pdf":
text_original = self.extract_text(None, document_path)
original_has_text = is_tagged_pdf(document_path, log=self.log) or (
text_original is not None and len(text_original) > PDF_TEXT_MIN_LENGTH
)
else:
text_original = None
original_has_text = False
# --- OCR_MODE=off: never invoke OCR engine ---
if self.settings.mode == ModeChoices.OFF:
if not produce_archive:
self.text = text_original or ""
return
if self.is_image(mime_type):
try:
self.archive_path = self._convert_image_to_pdfa(
document_path,
)
self.text = ""
except Exception as e:
raise ParseError(
f"Image to PDF/A conversion failed: {e!s}",
) from e
return
# PDFs in off mode: PDF/A conversion only via skip_text
archive_path = Path(self.tempdir) / "archive.pdf"
sidecar_file = Path(self.tempdir) / "sidecar.txt"
args = self.construct_ocrmypdf_parameters(
document_path,
mime_type,
archive_path,
sidecar_file,
skip_text=True,
)
try:
self.log.debug(
f"Calling OCRmyPDF (off mode, PDF/A conversion only): {args}",
)
ocrmypdf.ocr(**args)
self.archive_path = archive_path
self.text = self.extract_text(None, archive_path) or text_original or ""
except SubprocessOutputError as e:
self._handle_subprocess_output_error(e)
except Exception as e:
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
return
# --- OCR_MODE=auto: skip ocrmypdf entirely if text exists and no archive needed ---
if (
self.settings.mode == ModeChoices.AUTO
and original_has_text
and not produce_archive
):
self.log.debug(
"Document has text and no archive requested; skipping OCRmyPDF entirely.",
)
self.text = text_original
return
# --- All other paths: run ocrmypdf ---
archive_path = Path(self.tempdir) / "archive.pdf"
sidecar_file = Path(self.tempdir) / "sidecar.txt"
# auto mode with existing text: PDF/A conversion only (no OCR).
skip_text = self.settings.mode == ModeChoices.AUTO and original_has_text
args = self.construct_ocrmypdf_parameters(
document_path,
mime_type,
archive_path,
sidecar_file,
skip_text=skip_text,
)
try:
self.log.debug(f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
if produce_archive:
if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)
@@ -559,8 +474,16 @@ class RasterisedDocumentParser:
if original_has_text:
self.text = text_original
except SubprocessOutputError as e:
self._handle_subprocess_output_error(e)
except (NoTextFoundException, InputFileError, PriorOcrFoundError) as e:
if "Ghostscript PDF/A rendering" in str(e):
self.log.warning(
"Ghostscript PDF/A rendering failed, consider setting "
"PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
)
raise ParseError(
f"SubprocessOutputError: {e!s}. See logs for more information.",
) from e
except (NoTextFoundException, InputFileError) as e:
self.log.warning(
f"Encountered an error while running OCR: {e!s}. "
f"Attempting force OCR to get the text.",
@@ -569,6 +492,8 @@ class RasterisedDocumentParser:
archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
# Attempt to run OCR with safe settings.
args = self.construct_ocrmypdf_parameters(
document_path,
mime_type,
@@ -580,18 +505,25 @@ class RasterisedDocumentParser:
try:
self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
# Don't return the archived file here, since this file
# is bigger and blurry due to --force-ocr.
self.text = self.extract_text(
sidecar_file_fallback,
archive_path_fallback,
)
if produce_archive:
self.archive_path = archive_path_fallback
except Exception as e:
# If this fails, we have a serious issue at hand.
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
except Exception as e:
# Anything else is probably serious.
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
# As a last resort, if we still don't have any text for any reason,
# try to extract the text from the original document.
if not self.text:
if original_has_text:
self.text = text_original

View File

@@ -10,105 +10,15 @@ from __future__ import annotations
import logging
import re
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Final
if TYPE_CHECKING:
from pathlib import Path
from paperless.parsers import MetadataEntry
logger = logging.getLogger("paperless.parsers.utils")
# Minimum character count for a PDF to be considered "born-digital" (has real text).
# Used by both the consumer (archive decision) and the tesseract parser (skip-OCR decision).
PDF_TEXT_MIN_LENGTH: Final[int] = 50
def is_tagged_pdf(
path: Path,
log: logging.Logger | None = None,
) -> bool:
"""Return True if the PDF declares itself as tagged (born-digital indicator).
Tagged PDFs (e.g. exported from Word or LibreOffice) have ``/MarkInfo``
with ``/Marked true`` in the document root. This is a reliable signal
that the document has a logical structure and embedded text — running OCR
on it is unnecessary and archive generation can be skipped.
https://github.com/ocrmypdf/OCRmyPDF/blob/4e974ebd465a5921b2e79004f098f5d203010282/src/ocrmypdf/pdfinfo/info.py#L449
Parameters
----------
path:
Absolute path to the PDF file.
log:
Logger for warnings. Falls back to the module-level logger when omitted.
Returns
-------
bool
``True`` when the PDF is tagged, ``False`` otherwise or on any error.
"""
import pikepdf
_log = log or logger
try:
with pikepdf.open(path) as pdf:
mark_info = pdf.Root.get("/MarkInfo")
if mark_info is None:
return False
return bool(mark_info.get("/Marked", False))
except Exception:
_log.warning("Could not check PDF tag status for %s", path, exc_info=True)
return False
def extract_pdf_text(
path: Path,
log: logging.Logger | None = None,
) -> str | None:
"""Run pdftotext on *path* and return the extracted text, or None on failure.
Parameters
----------
path:
Absolute path to the PDF file.
log:
Logger for warnings. Falls back to the module-level logger when omitted.
Returns
-------
str | None
Extracted text, or ``None`` if pdftotext fails or the file is not a PDF.
"""
from documents.utils import run_subprocess
_log = log or logger
try:
with tempfile.TemporaryDirectory() as tmpdir:
out_path = Path(tmpdir) / "text.txt"
run_subprocess(
[
"pdftotext",
"-q",
"-layout",
"-enc",
"UTF-8",
str(path),
str(out_path),
],
logger=_log,
)
text = read_file_handle_unicode_errors(out_path, log=_log)
return text or None
except Exception:
_log.warning(
"Error while getting text from PDF document with pdftotext",
exc_info=True,
)
return None
def read_file_handle_unicode_errors(
filepath: Path,

View File

@@ -880,17 +880,10 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
# OCRmyPDF --output-type options are available.
OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
OCR_MODE = get_choice_from_env(
"PAPERLESS_OCR_MODE",
{"auto", "force", "redo", "off"},
default="auto",
)
# skip. redo, force
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
ARCHIVE_FILE_GENERATION = get_choice_from_env(
"PAPERLESS_ARCHIVE_FILE_GENERATION",
{"auto", "always", "never"},
default="auto",
)
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
OCR_IMAGE_DPI = get_int_from_env("PAPERLESS_OCR_IMAGE_DPI")

View File

@@ -708,7 +708,7 @@ def null_app_config(mocker: MockerFixture) -> MagicMock:
pages=None,
language=None,
mode=None,
archive_file_generation=None,
skip_archive_file=None,
image_dpi=None,
unpaper_clean=None,
deskew=None,

View File

@@ -1,436 +0,0 @@
"""
Focused tests for RasterisedDocumentParser.parse() mode behaviour.
These tests mock ``ocrmypdf.ocr`` so they run without a real Tesseract/OCRmyPDF
installation and execute quickly. The intent is to verify the *control flow*
introduced by the ``produce_archive`` flag and the ``OCR_MODE=auto/off`` logic,
not to test OCRmyPDF itself.
Fixtures are pulled from conftest.py in this package.
"""
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
import pytest
if TYPE_CHECKING:
from pytest_mock import MockerFixture
from paperless.parsers.tesseract import RasterisedDocumentParser
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
_LONG_TEXT = "This is a test document with enough text. " * 5 # >50 chars
_SHORT_TEXT = "Hi." # <50 chars
def _make_extract_text(text: str | None):
"""Return a side_effect function for ``extract_text`` that returns *text*."""
def _extract(sidecar_file, pdf_file):
return text
return _extract
# ---------------------------------------------------------------------------
# AUTO mode — PDF with sufficient text layer
# ---------------------------------------------------------------------------
class TestAutoModeWithText:
"""AUTO mode, original PDF has detectable text (>50 chars)."""
def test_auto_text_no_archive_skips_ocrmypdf(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
simple_digital_pdf_file: Path,
) -> None:
"""
GIVEN:
- AUTO mode, produce_archive=False
- PDF with text > VALID_TEXT_LENGTH
WHEN:
- parse() is called
THEN:
- ocrmypdf.ocr is NOT called (early return path)
- archive_path remains None
- text is set from the original
"""
# Patch extract_text to return long text (simulating detectable text layer)
mocker.patch.object(
tesseract_parser,
"extract_text",
return_value=_LONG_TEXT,
)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
simple_digital_pdf_file,
"application/pdf",
produce_archive=False,
)
mock_ocr.assert_not_called()
assert tesseract_parser.archive_path is None
assert tesseract_parser.get_text() == _LONG_TEXT
def test_auto_text_with_archive_calls_ocrmypdf_skip_text(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
simple_digital_pdf_file: Path,
) -> None:
"""
GIVEN:
- AUTO mode, produce_archive=True
- PDF with text > VALID_TEXT_LENGTH
WHEN:
- parse() is called
THEN:
- ocrmypdf.ocr IS called with skip_text=True
- archive_path is set
"""
mocker.patch.object(
tesseract_parser,
"extract_text",
return_value=_LONG_TEXT,
)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
simple_digital_pdf_file,
"application/pdf",
produce_archive=True,
)
mock_ocr.assert_called_once()
call_kwargs = mock_ocr.call_args.kwargs
assert call_kwargs.get("skip_text") is True
assert "force_ocr" not in call_kwargs
assert "redo_ocr" not in call_kwargs
assert tesseract_parser.archive_path is not None
# ---------------------------------------------------------------------------
# AUTO mode — PDF without text layer (or too short)
# ---------------------------------------------------------------------------
class TestAutoModeNoText:
"""AUTO mode, original PDF has no detectable text (<= 50 chars)."""
def test_auto_no_text_with_archive_calls_ocrmypdf_no_extra_flag(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
multi_page_images_pdf_file: Path,
) -> None:
"""
GIVEN:
- AUTO mode, produce_archive=True
- PDF with no text (or text <= VALID_TEXT_LENGTH)
WHEN:
- parse() is called
THEN:
- ocrmypdf.ocr IS called WITHOUT skip_text/force_ocr/redo_ocr
- archive_path is set (since produce_archive=True)
"""
# Return "no text" for the original; return real text for archive
extract_call_count = 0
def _extract_side(sidecar_file, pdf_file):
nonlocal extract_call_count
extract_call_count += 1
if extract_call_count == 1:
return None # original has no text
return _LONG_TEXT # text from archive after OCR
mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
multi_page_images_pdf_file,
"application/pdf",
produce_archive=True,
)
mock_ocr.assert_called_once()
call_kwargs = mock_ocr.call_args.kwargs
assert "skip_text" not in call_kwargs
assert "force_ocr" not in call_kwargs
assert "redo_ocr" not in call_kwargs
assert tesseract_parser.archive_path is not None
def test_auto_no_text_no_archive_calls_ocrmypdf(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
multi_page_images_pdf_file: Path,
) -> None:
"""
GIVEN:
- AUTO mode, produce_archive=False
- PDF with no text
WHEN:
- parse() is called
THEN:
- ocrmypdf.ocr IS called (no early return since no text detected)
- archive_path is NOT set (produce_archive=False)
"""
extract_call_count = 0
def _extract_side(sidecar_file, pdf_file):
nonlocal extract_call_count
extract_call_count += 1
if extract_call_count == 1:
return None
return _LONG_TEXT
mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
multi_page_images_pdf_file,
"application/pdf",
produce_archive=False,
)
mock_ocr.assert_called_once()
assert tesseract_parser.archive_path is None
# ---------------------------------------------------------------------------
# OFF mode — PDF
# ---------------------------------------------------------------------------
class TestOffModePdf:
"""OCR_MODE=off, document is a PDF."""
def test_off_no_archive_returns_pdftotext(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
simple_digital_pdf_file: Path,
) -> None:
"""
GIVEN:
- OFF mode, produce_archive=False
- PDF with text
WHEN:
- parse() is called
THEN:
- ocrmypdf.ocr is NOT called
- archive_path is None
- text comes from pdftotext (extract_text)
"""
mocker.patch.object(
tesseract_parser,
"extract_text",
return_value=_LONG_TEXT,
)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "off"
tesseract_parser.parse(
simple_digital_pdf_file,
"application/pdf",
produce_archive=False,
)
mock_ocr.assert_not_called()
assert tesseract_parser.archive_path is None
assert tesseract_parser.get_text() == _LONG_TEXT
def test_off_with_archive_calls_ocrmypdf_skip_text(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
simple_digital_pdf_file: Path,
) -> None:
"""
GIVEN:
- OFF mode, produce_archive=True
- PDF document
WHEN:
- parse() is called
THEN:
- ocrmypdf.ocr IS called with skip_text=True (PDF/A conversion only)
- archive_path is set
"""
mocker.patch.object(
tesseract_parser,
"extract_text",
return_value=_LONG_TEXT,
)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "off"
tesseract_parser.parse(
simple_digital_pdf_file,
"application/pdf",
produce_archive=True,
)
mock_ocr.assert_called_once()
call_kwargs = mock_ocr.call_args.kwargs
assert call_kwargs.get("skip_text") is True
assert "force_ocr" not in call_kwargs
assert "redo_ocr" not in call_kwargs
assert tesseract_parser.archive_path is not None
# ---------------------------------------------------------------------------
# OFF mode — image
# ---------------------------------------------------------------------------
class TestOffModeImage:
"""OCR_MODE=off, document is an image (PNG)."""
def test_off_image_no_archive_no_ocrmypdf(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
simple_png_file: Path,
) -> None:
"""
GIVEN:
- OFF mode, produce_archive=False
- Image document (PNG)
WHEN:
- parse() is called
THEN:
- ocrmypdf.ocr is NOT called
- archive_path is None
- text is empty string (images have no text layer)
"""
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "off"
tesseract_parser.parse(simple_png_file, "image/png", produce_archive=False)
mock_ocr.assert_not_called()
assert tesseract_parser.archive_path is None
assert tesseract_parser.get_text() == ""
def test_off_image_with_archive_uses_img2pdf_path(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
simple_png_file: Path,
) -> None:
"""
GIVEN:
- OFF mode, produce_archive=True
- Image document (PNG)
WHEN:
- parse() is called
THEN:
- _convert_image_to_pdfa() is called instead of ocrmypdf.ocr
- archive_path is set to the returned path
- text is empty string
"""
fake_archive = Path("/tmp/fake-archive.pdf")
mock_convert = mocker.patch.object(
tesseract_parser,
"_convert_image_to_pdfa",
return_value=fake_archive,
)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "off"
tesseract_parser.parse(simple_png_file, "image/png", produce_archive=True)
mock_convert.assert_called_once_with(simple_png_file)
mock_ocr.assert_not_called()
assert tesseract_parser.archive_path == fake_archive
assert tesseract_parser.get_text() == ""
# ---------------------------------------------------------------------------
# produce_archive=False never sets archive_path for FORCE / REDO / AUTO modes
# ---------------------------------------------------------------------------
class TestProduceArchiveFalse:
"""Verify produce_archive=False never results in an archive regardless of mode."""
@pytest.mark.parametrize("mode", ["force", "redo"])
def test_produce_archive_false_force_redo_modes(
self,
mode: str,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
multi_page_images_pdf_file: Path,
) -> None:
"""
GIVEN:
- FORCE or REDO mode, produce_archive=False
- Any PDF
WHEN:
- parse() is called (ocrmypdf mocked to succeed)
THEN:
- archive_path is NOT set even though ocrmypdf ran
"""
mocker.patch.object(
tesseract_parser,
"extract_text",
return_value=_LONG_TEXT,
)
mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = mode
tesseract_parser.parse(
multi_page_images_pdf_file,
"application/pdf",
produce_archive=False,
)
assert tesseract_parser.archive_path is None
assert tesseract_parser.get_text() is not None
def test_produce_archive_false_auto_with_text(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
simple_digital_pdf_file: Path,
) -> None:
"""
GIVEN:
- AUTO mode, produce_archive=False
- PDF with text > VALID_TEXT_LENGTH
WHEN:
- parse() is called
THEN:
- ocrmypdf is skipped entirely (early return)
- archive_path is None
"""
mocker.patch.object(
tesseract_parser,
"extract_text",
return_value=_LONG_TEXT,
)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
simple_digital_pdf_file,
"application/pdf",
produce_archive=False,
)
mock_ocr.assert_not_called()
assert tesseract_parser.archive_path is None

View File

@@ -94,35 +94,15 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
WHEN:
- OCR parameters are constructed
THEN:
- Configuration from database is utilized (AUTO mode with skip_text=True
triggers skip_text; AUTO mode alone does not add any extra flag)
- Configuration from database is utilized
"""
# AUTO mode with skip_text=True explicitly passed: skip_text is set
with override_settings(OCR_MODE="redo"):
instance = ApplicationConfiguration.objects.all().first()
instance.mode = ModeChoices.AUTO
instance.save()
params = RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
input_file="input.pdf",
output_file="output.pdf",
sidecar_file="sidecar.txt",
mime_type="application/pdf",
safe_fallback=False,
skip_text=True,
)
self.assertTrue(params["skip_text"])
self.assertNotIn("redo_ocr", params)
self.assertNotIn("force_ocr", params)
# AUTO mode alone (no skip_text): no extra OCR flag is set
with override_settings(OCR_MODE="redo"):
instance = ApplicationConfiguration.objects.all().first()
instance.mode = ModeChoices.AUTO
instance.mode = ModeChoices.SKIP
instance.save()
params = self.get_params()
self.assertNotIn("skip_text", params)
self.assertTrue(params["skip_text"])
self.assertNotIn("redo_ocr", params)
self.assertNotIn("force_ocr", params)

View File

@@ -370,26 +370,15 @@ class TestParsePdf:
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
"""
GIVEN:
- Multi-page digital PDF with sufficient text layer
- Default settings (mode=auto, produce_archive=True)
WHEN:
- Document is parsed
THEN:
- Archive is created (AUTO mode + text present + produce_archive=True
→ PDF/A conversion via skip_text)
- Text is extracted
"""
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-digital.pdf",
tesseract_samples_dir / "simple-digital.pdf",
"application/pdf",
)
assert tesseract_parser.archive_path is not None
assert tesseract_parser.archive_path.is_file()
assert_ordered_substrings(
tesseract_parser.get_text().lower(),
["page 1", "page 2", "page 3"],
tesseract_parser.get_text(),
["This is a test document."],
)
def test_with_form_default(
@@ -408,7 +397,7 @@ class TestParsePdf:
["Please enter your name in here:", "This is a PDF document with a form."],
)
def test_with_form_redo_no_archive_when_not_requested(
def test_with_form_redo_produces_no_archive(
self,
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
@@ -417,7 +406,6 @@ class TestParsePdf:
tesseract_parser.parse(
tesseract_samples_dir / "with-form.pdf",
"application/pdf",
produce_archive=False,
)
assert tesseract_parser.archive_path is None
assert_ordered_substrings(
@@ -445,7 +433,7 @@ class TestParsePdf:
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
tesseract_parser.settings.mode = "auto"
tesseract_parser.settings.mode = "skip"
tesseract_parser.parse(tesseract_samples_dir / "signed.pdf", "application/pdf")
assert tesseract_parser.archive_path is None
assert_ordered_substrings(
@@ -461,7 +449,7 @@ class TestParsePdf:
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
tesseract_parser.settings.mode = "auto"
tesseract_parser.settings.mode = "skip"
tesseract_parser.parse(
tesseract_samples_dir / "encrypted.pdf",
"application/pdf",
@@ -571,7 +559,7 @@ class TestParseMultiPage:
@pytest.mark.parametrize(
"mode",
[
pytest.param("auto", id="auto"),
pytest.param("skip", id="skip"),
pytest.param("redo", id="redo"),
pytest.param("force", id="force"),
],
@@ -599,7 +587,7 @@ class TestParseMultiPage:
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
tesseract_parser.settings.mode = "auto"
tesseract_parser.settings.mode = "skip"
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-images.pdf",
"application/pdf",
@@ -747,18 +735,16 @@ class TestSkipArchive:
"""
GIVEN:
- File with existing text layer
- Mode: auto, produce_archive=False
- Mode: skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text extracted from original; no archive created (text exists +
produce_archive=False skips OCRmyPDF entirely)
- Text extracted; no archive created
"""
tesseract_parser.settings.mode = "auto"
tesseract_parser.settings.mode = "skip_noarchive"
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-digital.pdf",
"application/pdf",
produce_archive=False,
)
assert tesseract_parser.archive_path is None
assert_ordered_substrings(
@@ -774,13 +760,13 @@ class TestSkipArchive:
"""
GIVEN:
- File with image-only pages (no text layer)
- Mode: auto, skip_archive_file: auto
- Mode: skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text extracted; archive created (OCR needed, no existing text)
- Text extracted; archive created (OCR needed)
"""
tesseract_parser.settings.mode = "auto"
tesseract_parser.settings.mode = "skip_noarchive"
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-images.pdf",
"application/pdf",
@@ -792,58 +778,41 @@ class TestSkipArchive:
)
@pytest.mark.parametrize(
("produce_archive", "filename", "expect_archive"),
("skip_archive_file", "filename", "expect_archive"),
[
pytest.param("never", "multi-page-digital.pdf", True, id="never-with-text"),
pytest.param("never", "multi-page-images.pdf", True, id="never-no-text"),
pytest.param(
True,
"multi-page-digital.pdf",
True,
id="produce-archive-with-text",
),
pytest.param(
True,
"multi-page-images.pdf",
True,
id="produce-archive-no-text",
),
pytest.param(
False,
"with_text",
"multi-page-digital.pdf",
False,
id="no-archive-with-text-layer",
id="with-text-layer",
),
pytest.param(
False,
"with_text",
"multi-page-images.pdf",
False,
id="no-archive-no-text-layer",
True,
id="with-text-no-layer",
),
pytest.param(
"always",
"multi-page-digital.pdf",
False,
id="always-with-text",
),
pytest.param("always", "multi-page-images.pdf", False, id="always-no-text"),
],
)
def test_produce_archive_flag(
def test_skip_archive_file_setting(
self,
produce_archive: bool, # noqa: FBT001
skip_archive_file: str,
filename: str,
expect_archive: bool, # noqa: FBT001
expect_archive: str,
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
"""
GIVEN:
- Various PDFs (with and without text layers)
- produce_archive flag set to True or False
WHEN:
- Document is parsed
THEN:
- archive_path is set if and only if produce_archive=True
- Text is always extracted
"""
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
tesseract_samples_dir / filename,
"application/pdf",
produce_archive=produce_archive,
)
tesseract_parser.settings.skip_archive_file = skip_archive_file
tesseract_parser.parse(tesseract_samples_dir / filename, "application/pdf")
text = tesseract_parser.get_text().lower()
assert_ordered_substrings(text, ["page 1", "page 2", "page 3"])
if expect_archive:
@@ -851,59 +820,6 @@ class TestSkipArchive:
else:
assert tesseract_parser.archive_path is None
def test_tagged_pdf_skips_ocr_in_auto_mode(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
"""
GIVEN:
- A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true)
- Mode: auto, produce_archive=False
WHEN:
- Document is parsed
THEN:
- OCRmyPDF is not invoked (tagged ⇒ original_has_text=True)
- Text is extracted from the original via pdftotext
- No archive is produced
"""
tesseract_parser.settings.mode = "auto"
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.parse(
tesseract_samples_dir / "simple-digital.pdf",
"application/pdf",
produce_archive=False,
)
mock_ocr.assert_not_called()
assert tesseract_parser.archive_path is None
assert tesseract_parser.get_text()
def test_tagged_pdf_produces_pdfa_archive_without_ocr(
self,
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
"""
GIVEN:
- A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true)
- Mode: auto, produce_archive=True
WHEN:
- Document is parsed
THEN:
- OCRmyPDF runs with skip_text (PDF/A conversion only, no OCR)
- Archive is produced
- Text is preserved from the original
"""
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
tesseract_samples_dir / "simple-digital.pdf",
"application/pdf",
produce_archive=True,
)
assert tesseract_parser.archive_path is not None
assert tesseract_parser.get_text()
# ---------------------------------------------------------------------------
# Parse — mixed pages / sidecar
@@ -919,13 +835,13 @@ class TestParseMixed:
"""
GIVEN:
- File with text in some pages (image) and some pages (digital)
- Mode: auto (skip_text), skip_archive_file: always
- Mode: skip
WHEN:
- Document is parsed
THEN:
- All pages extracted; archive created; sidecar notes skipped pages
"""
tesseract_parser.settings.mode = "auto"
tesseract_parser.settings.mode = "skip"
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-mixed.pdf",
"application/pdf",
@@ -982,18 +898,17 @@ class TestParseMixed:
) -> None:
"""
GIVEN:
- File with mixed pages (some with text, some image-only)
- Mode: auto, produce_archive=False
- File with mixed pages
- Mode: skip_noarchive
WHEN:
- Document is parsed
THEN:
- No archive created (produce_archive=False); text from text layer present
- No archive created (file has text layer); later-page text present
"""
tesseract_parser.settings.mode = "auto"
tesseract_parser.settings.mode = "skip_noarchive"
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-mixed.pdf",
"application/pdf",
produce_archive=False,
)
assert tesseract_parser.archive_path is None
assert_ordered_substrings(
@@ -1008,12 +923,12 @@ class TestParseMixed:
class TestParseRotate:
def test_rotate_auto_mode(
def test_rotate_skip_mode(
self,
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
tesseract_parser.settings.mode = "auto"
tesseract_parser.settings.mode = "skip"
tesseract_parser.settings.rotate = True
tesseract_parser.parse(tesseract_samples_dir / "rotated.pdf", "application/pdf")
assert_ordered_substrings(
@@ -1040,19 +955,12 @@ class TestParseRtl:
) -> None:
"""
GIVEN:
- PDF with RTL Arabic text in its text layer (short: 18 chars)
- mode=off, produce_archive=True: PDF/A conversion via skip_text, no OCR engine
- PDF with RTL Arabic text
WHEN:
- Document is parsed
THEN:
- Arabic content is extracted from the PDF text layer (normalised for bidi)
Note: The RTL PDF has a short text layer (< VALID_TEXT_LENGTH=50) so AUTO mode
would attempt full OCR, which fails due to PriorOcrFoundError and falls back to
force-ocr with English Tesseract (producing garbage). Using mode="off" forces
skip_text=True so the Arabic text layer is preserved through PDF/A conversion.
- Arabic content is extracted (normalised for bidi)
"""
tesseract_parser.settings.mode = "off"
tesseract_parser.parse(
tesseract_samples_dir / "rtl-test.pdf",
"application/pdf",
@@ -1115,11 +1023,11 @@ class TestOcrmypdfParameters:
assert ("clean" in params) == expected_clean
assert ("clean_final" in params) == expected_clean_final
def test_clean_final_auto_mode(
def test_clean_final_skip_mode(
self,
make_tesseract_parser: MakeTesseractParser,
) -> None:
with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="auto") as parser:
with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="skip") as parser:
params = parser.construct_ocrmypdf_parameters("", "", "", "")
assert params["clean_final"] is True
assert "clean" not in params
@@ -1136,9 +1044,9 @@ class TestOcrmypdfParameters:
@pytest.mark.parametrize(
("ocr_mode", "ocr_deskew", "expect_deskew"),
[
pytest.param("auto", True, True, id="auto-deskew-on"),
pytest.param("skip", True, True, id="skip-deskew-on"),
pytest.param("redo", True, False, id="redo-deskew-off"),
pytest.param("auto", False, False, id="auto-no-deskew"),
pytest.param("skip", False, False, id="skip-no-deskew"),
],
)
def test_deskew_option(

View File

@@ -132,13 +132,13 @@ class TestOcrSettingsChecks:
pytest.param(
"OCR_MODE",
"skip_noarchive",
'OCR output mode "skip_noarchive"',
id="deprecated-mode-now-invalid",
"deprecated",
id="deprecated-mode",
),
pytest.param(
"ARCHIVE_FILE_GENERATION",
"OCR_SKIP_ARCHIVE_FILE",
"invalid",
'PAPERLESS_ARCHIVE_FILE_GENERATION setting "invalid"',
'OCR_SKIP_ARCHIVE_FILE setting "invalid"',
id="invalid-skip-archive-file",
),
pytest.param(

View File

@@ -1,64 +0,0 @@
"""Tests for v3 system checks: deprecated v2 OCR env var warnings."""
from __future__ import annotations
import os
from typing import TYPE_CHECKING
import pytest
from paperless.checks import check_deprecated_v2_ocr_env_vars
if TYPE_CHECKING:
from pytest_mock import MockerFixture
class TestDeprecatedV2OcrEnvVarWarnings:
def test_no_deprecated_vars_returns_empty(self, mocker: MockerFixture) -> None:
"""No warnings when neither deprecated variable is set."""
mocker.patch.dict(os.environ, {"PAPERLESS_OCR_MODE": "auto"}, clear=True)
result = check_deprecated_v2_ocr_env_vars(None)
assert result == []
@pytest.mark.parametrize(
("env_var", "env_value", "expected_id", "expected_fragment"),
[
pytest.param(
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE",
"always",
"paperless.W002",
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE",
id="skip-archive-file-warns",
),
pytest.param(
"PAPERLESS_OCR_MODE",
"skip",
"paperless.W003",
"skip",
id="ocr-mode-skip-warns",
),
pytest.param(
"PAPERLESS_OCR_MODE",
"skip_noarchive",
"paperless.W003",
"skip_noarchive",
id="ocr-mode-skip-noarchive-warns",
),
],
)
def test_deprecated_var_produces_one_warning(
self,
mocker: MockerFixture,
env_var: str,
env_value: str,
expected_id: str,
expected_fragment: str,
) -> None:
"""Each deprecated setting in isolation produces exactly one warning."""
mocker.patch.dict(os.environ, {env_var: env_value}, clear=True)
result = check_deprecated_v2_ocr_env_vars(None)
assert len(result) == 1
warning = result[0]
assert warning.id == expected_id
assert expected_fragment in warning.msg

View File

@@ -1,66 +0,0 @@
"""Tests for OcrConfig archive_file_generation field behavior."""
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
from django.test import override_settings
from paperless.config import OcrConfig
if TYPE_CHECKING:
from unittest.mock import MagicMock
@pytest.fixture()
def null_app_config(mocker) -> MagicMock:
    """Mock ApplicationConfiguration with every field None, forcing fallback to Django settings."""
    unset_fields = (
        "output_type",
        "pages",
        "language",
        "mode",
        "archive_file_generation",
        "image_dpi",
        "unpaper_clean",
        "deskew",
        "rotate_pages",
        "rotate_pages_threshold",
        "max_image_pixels",
        "color_conversion_strategy",
        "user_args",
    )
    # All attributes explicitly None so OcrConfig treats the DB row as "unset".
    return mocker.MagicMock(**{field: None for field in unset_fields})
@pytest.fixture()
def make_ocr_config(mocker, null_app_config):
    """Factory fixture: build an OcrConfig under arbitrary Django settings overrides.

    The DB-backed configuration is patched to the all-None mock so the
    resulting config reflects only the overridden Django settings.
    """
    mocker.patch(
        "paperless.config.BaseConfig._get_config_instance",
        return_value=null_app_config,
    )

    def _factory(**overrides):
        with override_settings(**overrides):
            return OcrConfig()

    return _factory
class TestOcrConfigArchiveFileGeneration:
    """archive_file_generation comes from Django settings unless set in the DB row."""

    def test_auto_from_settings(self, make_ocr_config) -> None:
        """'auto' in settings flows through to the config."""
        config = make_ocr_config(OCR_MODE="auto", ARCHIVE_FILE_GENERATION="auto")
        assert config.archive_file_generation == "auto"

    def test_always_from_settings(self, make_ocr_config) -> None:
        """'always' in settings flows through to the config."""
        config = make_ocr_config(ARCHIVE_FILE_GENERATION="always")
        assert config.archive_file_generation == "always"

    def test_never_from_settings(self, make_ocr_config) -> None:
        """'never' in settings flows through to the config."""
        config = make_ocr_config(ARCHIVE_FILE_GENERATION="never")
        assert config.archive_file_generation == "never"

    def test_db_value_overrides_setting(self, make_ocr_config, null_app_config) -> None:
        """A non-None database value takes precedence over the Django setting."""
        null_app_config.archive_file_generation = "never"
        config = make_ocr_config(ARCHIVE_FILE_GENERATION="always")
        assert config.archive_file_generation == "never"

View File

@@ -1,25 +0,0 @@
"""Tests for paperless.parsers.utils helpers."""
from __future__ import annotations
from pathlib import Path
from paperless.parsers.utils import is_tagged_pdf
# Directory of sample PDFs (tesseract fixtures) used by the tests below.
SAMPLES = Path(__file__).parent / "samples" / "tesseract"
class TestIsTaggedPdf:
    """is_tagged_pdf returns True only for tagged PDFs and False on any bad input."""

    def test_tagged_pdf_returns_true(self) -> None:
        """A known tagged sample is recognized."""
        assert is_tagged_pdf(SAMPLES / "simple-digital.pdf") is True

    def test_untagged_pdf_returns_false(self) -> None:
        """An image-only (untagged) sample is rejected."""
        assert is_tagged_pdf(SAMPLES / "multi-page-images.pdf") is False

    def test_nonexistent_path_returns_false(self) -> None:
        """A missing file does not raise; it simply reports False."""
        assert is_tagged_pdf(Path("/nonexistent/file.pdf")) is False

    def test_corrupt_pdf_returns_false(self, tmp_path: Path) -> None:
        """Garbage bytes with a .pdf name do not raise; they report False."""
        corrupt_file = tmp_path / "bad.pdf"
        corrupt_file.write_bytes(b"not a pdf")
        assert is_tagged_pdf(corrupt_file) is False