diff --git a/docs/api.md b/docs/api.md index 2284d9d29..af1190f3d 100644 --- a/docs/api.md +++ b/docs/api.md @@ -62,10 +62,14 @@ The REST api provides five different forms of authentication. ## Searching for documents -Full text searching is available on the `/api/documents/` endpoint. Two -specific query parameters cause the API to return full text search +Full text searching is available on the `/api/documents/` endpoint. The +following query parameters cause the API to return Tantivy-backed search results: +- `/api/documents/?text=your%20search%20query`: Search title and content + using simple substring-style search. +- `/api/documents/?title_search=your%20search%20query`: Search title only + using simple substring-style search. - `/api/documents/?query=your%20search%20query`: Search for a document using a full text query. For details on the syntax, see [Basic Usage - Searching](usage.md#basic-usage_searching). - `/api/documents/?more_like_id=1234`: Search for documents similar to @@ -439,3 +443,5 @@ Initial API version. - The `all` parameter of list endpoints is now deprecated and will be removed in a future version. - The bulk edit objects endpoint now supports `all` and `filters` parameters to avoid having to send large lists of object IDs for operations affecting many objects. +- The legacy `title_content` document search parameter is deprecated and will be removed in a future version. + Clients should use `text` for simple title-and-content search and `title_search` for title-only search. diff --git a/src-ui/e2e/document-list/document-list.spec.ts b/src-ui/e2e/document-list/document-list.spec.ts index 700304186..0cea8effa 100644 --- a/src-ui/e2e/document-list/document-list.spec.ts +++ b/src-ui/e2e/document-list/document-list.spec.ts @@ -49,11 +49,11 @@ test('text filtering', async ({ page }) => { await page.getByRole('main').getByRole('combobox').click() await page.getByRole('main').getByRole('combobox').fill('test') await expect(page.locator('pngx-document-list')).toHaveText(/32 documents/) - await expect(page).toHaveURL(/title_content=test/) + await expect(page).toHaveURL(/text=test/) await page.getByRole('button', { name: 'Title & content' }).click() await page.getByRole('button', { name: 'Title', exact: true }).click() await expect(page.locator('pngx-document-list')).toHaveText(/9 documents/) - await expect(page).toHaveURL(/title__icontains=test/) + await expect(page).toHaveURL(/title_search=test/) await page.getByRole('button', { name: 'Title', exact: true }).click() await page.getByRole('button', { name: 'Advanced search' }).click() await expect(page).toHaveURL(/query=test/) diff --git a/src-ui/e2e/document-list/requests/api-document-list2.har b/src-ui/e2e/document-list/requests/api-document-list2.har index 3cbc9e8a6..f6a488b26 100644 --- a/src-ui/e2e/document-list/requests/api-document-list2.har +++ b/src-ui/e2e/document-list/requests/api-document-list2.har @@ -3545,7 +3545,7 @@ "time": 1.091, "request": { "method": "GET", - "url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&title_content=test", + "url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&text=test", "httpVersion": "HTTP/1.1", "cookies": [], "headers": [ @@ -3579,7 +3579,7 @@ "value": "true" }, { - "name": "title_content", + "name": "text", "value": "test" } ], @@ -4303,7 +4303,7 @@ "time": 0.603, "request": { "method": "GET", - "url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&title__icontains=test", + "url": "http://localhost:8000/api/documents/?page=1&page_size=50&ordering=-created&truncate_content=true&include_selection_data=true&title_search=test", "httpVersion": "HTTP/1.1", "cookies": [], "headers": [ @@ -4337,7 +4337,7 @@ "value": "true" }, { - "name": "title__icontains", + "name": "title_search", "value": "test" } ], diff --git a/src-ui/src/app/components/app-frame/global-search/global-search.component.spec.ts b/src-ui/src/app/components/app-frame/global-search/global-search.component.spec.ts index eaae4a814..1be801478 100644 --- a/src-ui/src/app/components/app-frame/global-search/global-search.component.spec.ts +++ b/src-ui/src/app/components/app-frame/global-search/global-search.component.spec.ts @@ -24,7 +24,7 @@ import { FILTER_HAS_DOCUMENT_TYPE_ANY, FILTER_HAS_STORAGE_PATH_ANY, FILTER_HAS_TAGS_ALL, - FILTER_TITLE_CONTENT, + FILTER_SIMPLE_TEXT, } from 'src/app/data/filter-rule-type' import { GlobalSearchType, SETTINGS_KEYS } from 'src/app/data/ui-settings' import { DocumentListViewService } from 'src/app/services/document-list-view.service' @@ -545,7 +545,7 @@ describe('GlobalSearchComponent', () => { component.query = 'test' component.runFullSearch() expect(qfSpy).toHaveBeenCalledWith([ - { rule_type: FILTER_TITLE_CONTENT, value: 'test' }, + { rule_type: FILTER_SIMPLE_TEXT, value: 'test' }, ]) settingsService.set( diff --git a/src-ui/src/app/components/app-frame/global-search/global-search.component.ts b/src-ui/src/app/components/app-frame/global-search/global-search.component.ts index 4f9a2467c..e95b52cfc 100644 --- a/src-ui/src/app/components/app-frame/global-search/global-search.component.ts +++ b/src-ui/src/app/components/app-frame/global-search/global-search.component.ts @@ -25,7 +25,7 @@ import { FILTER_HAS_DOCUMENT_TYPE_ANY, FILTER_HAS_STORAGE_PATH_ANY, FILTER_HAS_TAGS_ALL, - FILTER_TITLE_CONTENT, + FILTER_SIMPLE_TEXT, } from 'src/app/data/filter-rule-type' import { ObjectWithId } from 'src/app/data/object-with-id' import { GlobalSearchType, SETTINGS_KEYS } from 'src/app/data/ui-settings' @@ -410,7 +410,7 @@ export class GlobalSearchComponent implements OnInit { public runFullSearch() { const ruleType = this.useAdvancedForFullSearch ? FILTER_FULLTEXT_QUERY - : FILTER_TITLE_CONTENT + : FILTER_SIMPLE_TEXT this.documentService.searchQuery = this.useAdvancedForFullSearch ? this.query : '' diff --git a/src-ui/src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.spec.ts b/src-ui/src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.spec.ts index 89e7b1fee..2466ced73 100644 --- a/src-ui/src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.spec.ts +++ b/src-ui/src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.spec.ts @@ -4,7 +4,7 @@ import { ComponentFixture, TestBed } from '@angular/core/testing' import { By } from '@angular/platform-browser' import { NgbAccordionButton, NgbActiveModal } from '@ng-bootstrap/ng-bootstrap' import { of, throwError } from 'rxjs' -import { FILTER_TITLE } from 'src/app/data/filter-rule-type' +import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type' import { DocumentService } from 'src/app/services/rest/document.service' import { StoragePathService } from 'src/app/services/rest/storage-path.service' import { SettingsService } from 'src/app/services/settings.service' @@ -105,7 +105,7 @@ describe('StoragePathEditDialogComponent', () => { null, 'created', true, - [{ rule_type: FILTER_TITLE, value: 'bar' }], + [{ rule_type: FILTER_SIMPLE_TITLE, value: 'bar' }], { truncate_content: true } ) listSpy.mockReturnValueOnce( diff --git a/src-ui/src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.ts b/src-ui/src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.ts index f06831588..68ce40f5e 100644 --- a/src-ui/src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.ts +++ b/src-ui/src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.ts @@ -23,7 +23,7 @@ import { } from 'rxjs' import { EditDialogComponent } from 'src/app/components/common/edit-dialog/edit-dialog.component' import { Document } from 'src/app/data/document' -import { FILTER_TITLE } from 'src/app/data/filter-rule-type' +import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type' import { DEFAULT_MATCHING_ALGORITHM } from 'src/app/data/matching-model' import { StoragePath } from 'src/app/data/storage-path' import { IfOwnerDirective } from 'src/app/directives/if-owner.directive' @@ -146,7 +146,7 @@ export class StoragePathEditDialogComponent null, 'created', true, - [{ rule_type: FILTER_TITLE, value: title }], + [{ rule_type: FILTER_SIMPLE_TITLE, value: title }], { truncate_content: true } ) .pipe( diff --git a/src-ui/src/app/components/common/input/document-link/document-link.component.spec.ts b/src-ui/src/app/components/common/input/document-link/document-link.component.spec.ts index 7021012ab..f8a8f3817 100644 --- a/src-ui/src/app/components/common/input/document-link/document-link.component.spec.ts +++ b/src-ui/src/app/components/common/input/document-link/document-link.component.spec.ts @@ -3,7 +3,7 @@ import { provideHttpClientTesting } from '@angular/common/http/testing' import { ComponentFixture, TestBed } from '@angular/core/testing' import { NG_VALUE_ACCESSOR } from '@angular/forms' import { of, throwError } from 'rxjs' -import { FILTER_TITLE } from 'src/app/data/filter-rule-type' +import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type' import { DocumentService } from 'src/app/services/rest/document.service' import { DocumentLinkComponent } from './document-link.component' @@ -99,7 +99,7 @@ describe('DocumentLinkComponent', () => { null, 'created', true, - [{ rule_type: FILTER_TITLE, value: 'bar' }], + [{ rule_type: FILTER_SIMPLE_TITLE, value: 'bar' }], { truncate_content: true } ) listSpy.mockReturnValueOnce(throwError(() => new Error())) diff --git a/src-ui/src/app/components/common/input/document-link/document-link.component.ts b/src-ui/src/app/components/common/input/document-link/document-link.component.ts index b50f5701d..9bfb60063 100644 --- a/src-ui/src/app/components/common/input/document-link/document-link.component.ts +++ b/src-ui/src/app/components/common/input/document-link/document-link.component.ts @@ -28,7 +28,7 @@ import { tap, } from 'rxjs' import { Document } from 'src/app/data/document' -import { FILTER_TITLE } from 'src/app/data/filter-rule-type' +import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type' import { CustomDatePipe } from 'src/app/pipes/custom-date.pipe' import { DocumentService } from 'src/app/services/rest/document.service' import { AbstractInputComponent } from '../abstract-input' @@ -121,7 +121,7 @@ export class DocumentLinkComponent null, 'created', true, - [{ rule_type: FILTER_TITLE, value: title }], + [{ rule_type: FILTER_SIMPLE_TITLE, value: title }], { truncate_content: true } ) .pipe( diff --git a/src-ui/src/app/components/document-list/bulk-editor/bulk-editor.component.spec.ts b/src-ui/src/app/components/document-list/bulk-editor/bulk-editor.component.spec.ts index f283a75f3..8f82be1ab 100644 --- a/src-ui/src/app/components/document-list/bulk-editor/bulk-editor.component.spec.ts +++ b/src-ui/src/app/components/document-list/bulk-editor/bulk-editor.component.spec.ts @@ -428,7 +428,7 @@ describe('BulkEditorComponent', () => { req.flush(true) expect(req.request.body).toEqual({ all: true, - filters: { title__icontains: 'apple' }, + filters: { title_search: 'apple' }, method: 'modify_tags', parameters: { add_tags: [101], remove_tags: [] }, }) diff --git a/src-ui/src/app/components/document-list/filter-editor/filter-editor.component.spec.ts b/src-ui/src/app/components/document-list/filter-editor/filter-editor.component.spec.ts index bf5240f1b..d75e38630 100644 --- a/src-ui/src/app/components/document-list/filter-editor/filter-editor.component.spec.ts +++ b/src-ui/src/app/components/document-list/filter-editor/filter-editor.component.spec.ts @@ -67,6 +67,8 @@ import { FILTER_OWNER_DOES_NOT_INCLUDE, FILTER_OWNER_ISNULL, FILTER_SHARED_BY_USER, + FILTER_SIMPLE_TEXT, + FILTER_SIMPLE_TITLE, FILTER_STORAGE_PATH, FILTER_TITLE, FILTER_TITLE_CONTENT, @@ -312,7 +314,7 @@ describe('FilterEditorComponent', () => { expect(component.textFilter).toEqual(null) component.filterRules = [ { - rule_type: FILTER_TITLE_CONTENT, + rule_type: FILTER_SIMPLE_TEXT, value: 'foo', }, ] @@ -320,6 +322,18 @@ describe('FilterEditorComponent', () => { expect(component.textFilterTarget).toEqual('title-content') // TEXT_FILTER_TARGET_TITLE_CONTENT })) + it('should ingest legacy text filter rules for doc title + content', fakeAsync(() => { + expect(component.textFilter).toEqual(null) + component.filterRules = [ + { + rule_type: FILTER_TITLE_CONTENT, + value: 'legacy foo', + }, + ] + expect(component.textFilter).toEqual('legacy foo') + expect(component.textFilterTarget).toEqual('title-content') // TEXT_FILTER_TARGET_TITLE_CONTENT + })) + it('should ingest text filter rules for doc asn', fakeAsync(() => { expect(component.textFilter).toEqual(null) component.filterRules = [ @@ -1117,7 +1131,7 @@ describe('FilterEditorComponent', () => { expect(component.textFilter).toEqual('foo') expect(component.filterRules).toEqual([ { - rule_type: FILTER_TITLE_CONTENT, + rule_type: FILTER_SIMPLE_TEXT, value: 'foo', }, ]) @@ -1136,7 +1150,7 @@ describe('FilterEditorComponent', () => { expect(component.textFilterTarget).toEqual('title') expect(component.filterRules).toEqual([ { - rule_type: FILTER_TITLE, + rule_type: FILTER_SIMPLE_TITLE, value: 'foo', }, ]) @@ -1250,30 +1264,12 @@ describe('FilterEditorComponent', () => { ]) })) - it('should convert user input to correct filter rules on custom fields query', fakeAsync(() => { - component.textFilterInput.nativeElement.value = 'foo' - component.textFilterInput.nativeElement.dispatchEvent(new Event('input')) - const textFieldTargetDropdown = fixture.debugElement.queryAll( - By.directive(NgbDropdownItem) - )[3] - textFieldTargetDropdown.triggerEventHandler('click') // TEXT_FILTER_TARGET_CUSTOM_FIELDS - fixture.detectChanges() - tick(400) - expect(component.textFilterTarget).toEqual('custom-fields') - expect(component.filterRules).toEqual([ - { - rule_type: FILTER_CUSTOM_FIELDS_TEXT, - value: 'foo', - }, - ]) - })) - it('should convert user input to correct filter rules on mime type', fakeAsync(() => { component.textFilterInput.nativeElement.value = 'pdf' component.textFilterInput.nativeElement.dispatchEvent(new Event('input')) const textFieldTargetDropdown = fixture.debugElement.queryAll( By.directive(NgbDropdownItem) - )[4] + )[3] textFieldTargetDropdown.triggerEventHandler('click') // TEXT_FILTER_TARGET_MIME_TYPE fixture.detectChanges() tick(400) @@ -1291,8 +1287,8 @@ describe('FilterEditorComponent', () => { component.textFilterInput.nativeElement.dispatchEvent(new Event('input')) const textFieldTargetDropdown = fixture.debugElement.queryAll( By.directive(NgbDropdownItem) - )[5] - textFieldTargetDropdown.triggerEventHandler('click') // TEXT_FILTER_TARGET_ASN + )[4] + textFieldTargetDropdown.triggerEventHandler('click') // TEXT_FILTER_TARGET_FULLTEXT_QUERY fixture.detectChanges() tick(400) expect(component.textFilterTarget).toEqual('fulltext-query') @@ -1696,12 +1692,56 @@ describe('FilterEditorComponent', () => { ]) })) + it('should convert legacy title filters into full text query when adding a created relative date', fakeAsync(() => { + component.filterRules = [ + { + rule_type: FILTER_TITLE, + value: 'foo', + }, + ] + const dateCreatedDropdown = fixture.debugElement.queryAll( + By.directive(DatesDropdownComponent) + )[0] + component.dateCreatedRelativeDate = RelativeDate.WITHIN_1_WEEK + dateCreatedDropdown.triggerEventHandler('datesSet') + fixture.detectChanges() + tick(400) + expect(component.filterRules).toEqual([ + { + rule_type: FILTER_FULLTEXT_QUERY, + value: 'foo,created:[-1 week to now]', + }, + ]) + })) + + it('should convert simple title filters into full text query when adding a created relative date', fakeAsync(() => { + component.filterRules = [ + { + rule_type: FILTER_SIMPLE_TITLE, + value: 'foo', + }, + ] + const dateCreatedDropdown = fixture.debugElement.queryAll( + By.directive(DatesDropdownComponent) + )[0] + component.dateCreatedRelativeDate = RelativeDate.WITHIN_1_WEEK + dateCreatedDropdown.triggerEventHandler('datesSet') + fixture.detectChanges() + tick(400) + expect(component.filterRules).toEqual([ + { + rule_type: FILTER_FULLTEXT_QUERY, + value: 'foo,created:[-1 week to now]', + }, + ]) + })) + it('should leave relative dates not in quick list intact', fakeAsync(() => { component.textFilterInput.nativeElement.value = 'created:[-2 week to now]' component.textFilterInput.nativeElement.dispatchEvent(new Event('input')) const textFieldTargetDropdown = fixture.debugElement.queryAll( By.directive(NgbDropdownItem) - )[5] + )[4] textFieldTargetDropdown.triggerEventHandler('click') fixture.detectChanges() tick(400) @@ -2031,12 +2071,30 @@ describe('FilterEditorComponent', () => { component.filterRules = [ { - rule_type: FILTER_TITLE, + rule_type: FILTER_SIMPLE_TITLE, value: 'foo', }, ] expect(component.generateFilterName()).toEqual('Title: foo') + component.filterRules = [ + { + rule_type: FILTER_TITLE_CONTENT, + value: 'legacy foo', + }, + ] + expect(component.generateFilterName()).toEqual( + 'Title & content: legacy foo' + ) + + component.filterRules = [ + { + rule_type: FILTER_SIMPLE_TEXT, + value: 'foo', + }, + ] + expect(component.generateFilterName()).toEqual('Title & content: foo') + component.filterRules = [ { rule_type: FILTER_ASN, @@ -2156,6 +2214,36 @@ describe('FilterEditorComponent', () => { }) }) + it('should hide deprecated custom fields target from default text filter targets', () => { + expect(component.textFilterTargets).not.toContainEqual({ + id: 'custom-fields', + name: $localize`Custom fields (Deprecated)`, + }) + }) + + it('should keep deprecated custom fields target available for legacy filters', fakeAsync(() => { + component.filterRules = [ + { + rule_type: FILTER_CUSTOM_FIELDS_TEXT, + value: 'foo', + }, + ] + fixture.detectChanges() + tick() + + expect(component.textFilterTarget).toEqual('custom-fields') + expect(component.textFilterTargets).toContainEqual({ + id: 'custom-fields', + name: $localize`Custom fields (Deprecated)`, + }) + expect(component.filterRules).toEqual([ + { + rule_type: FILTER_CUSTOM_FIELDS_TEXT, + value: 'foo', + }, + ]) + })) + it('should call autocomplete endpoint on input', fakeAsync(() => { component.textFilterTarget = 'fulltext-query' // TEXT_FILTER_TARGET_FULLTEXT_QUERY const autocompleteSpy = jest.spyOn(searchService, 'autocomplete') diff --git a/src-ui/src/app/components/document-list/filter-editor/filter-editor.component.ts b/src-ui/src/app/components/document-list/filter-editor/filter-editor.component.ts index f7b50181b..b4e63317a 100644 --- a/src-ui/src/app/components/document-list/filter-editor/filter-editor.component.ts +++ b/src-ui/src/app/components/document-list/filter-editor/filter-editor.component.ts @@ -71,6 +71,8 @@ import { FILTER_OWNER_DOES_NOT_INCLUDE, FILTER_OWNER_ISNULL, FILTER_SHARED_BY_USER, + FILTER_SIMPLE_TEXT, + FILTER_SIMPLE_TITLE, FILTER_STORAGE_PATH, FILTER_TITLE, FILTER_TITLE_CONTENT, @@ -195,10 +197,6 @@ const DEFAULT_TEXT_FILTER_TARGET_OPTIONS = [ name: $localize`Title & content`, }, { id: TEXT_FILTER_TARGET_ASN, name: $localize`ASN` }, - { - id: TEXT_FILTER_TARGET_CUSTOM_FIELDS, - name: $localize`Custom fields`, - }, { id: TEXT_FILTER_TARGET_MIME_TYPE, name: $localize`File type` }, { id: TEXT_FILTER_TARGET_FULLTEXT_QUERY, @@ -206,6 +204,12 @@ const DEFAULT_TEXT_FILTER_TARGET_OPTIONS = [ }, ] +const DEPRECATED_CUSTOM_FIELDS_TEXT_FILTER_TARGET_OPTION = { + // Kept only so legacy saved views can render and be edited away from, remove me eventually + id: TEXT_FILTER_TARGET_CUSTOM_FIELDS, + name: $localize`Custom fields (Deprecated)`, +} + const TEXT_FILTER_TARGET_MORELIKE_OPTION = { id: TEXT_FILTER_TARGET_FULLTEXT_MORELIKE, name: $localize`More like`, @@ -318,8 +322,13 @@ export class FilterEditorComponent return $localize`Custom fields query` case FILTER_TITLE: + case FILTER_SIMPLE_TITLE: return $localize`Title: ${rule.value}` + case FILTER_TITLE_CONTENT: + case FILTER_SIMPLE_TEXT: + return $localize`Title & content: ${rule.value}` + case FILTER_ASN: return $localize`ASN: ${rule.value}` @@ -353,12 +362,16 @@ export class FilterEditorComponent _moreLikeDoc: Document get textFilterTargets() { + let targets = DEFAULT_TEXT_FILTER_TARGET_OPTIONS if (this.textFilterTarget == TEXT_FILTER_TARGET_FULLTEXT_MORELIKE) { - return DEFAULT_TEXT_FILTER_TARGET_OPTIONS.concat([ - TEXT_FILTER_TARGET_MORELIKE_OPTION, + targets = targets.concat([TEXT_FILTER_TARGET_MORELIKE_OPTION]) + } + if (this.textFilterTarget == TEXT_FILTER_TARGET_CUSTOM_FIELDS) { + targets = targets.concat([ + DEPRECATED_CUSTOM_FIELDS_TEXT_FILTER_TARGET_OPTION, ]) } - return DEFAULT_TEXT_FILTER_TARGET_OPTIONS + return targets } textFilterTarget = TEXT_FILTER_TARGET_TITLE_CONTENT @@ -437,10 +450,12 @@ export class FilterEditorComponent value.forEach((rule) => { switch (rule.rule_type) { case FILTER_TITLE: + case FILTER_SIMPLE_TITLE: this._textFilter = rule.value this.textFilterTarget = TEXT_FILTER_TARGET_TITLE break case FILTER_TITLE_CONTENT: + case FILTER_SIMPLE_TEXT: this._textFilter = rule.value this.textFilterTarget = TEXT_FILTER_TARGET_TITLE_CONTENT break @@ -762,12 +777,15 @@ export class FilterEditorComponent this.textFilterTarget == TEXT_FILTER_TARGET_TITLE_CONTENT ) { filterRules.push({ - rule_type: FILTER_TITLE_CONTENT, + rule_type: FILTER_SIMPLE_TEXT, value: this._textFilter.trim(), }) } if (this._textFilter && this.textFilterTarget == TEXT_FILTER_TARGET_TITLE) { - filterRules.push({ rule_type: FILTER_TITLE, value: this._textFilter }) + filterRules.push({ + rule_type: FILTER_SIMPLE_TITLE, + value: this._textFilter, + }) } if (this.textFilterTarget == TEXT_FILTER_TARGET_ASN) { if ( @@ -1009,7 +1027,10 @@ export class FilterEditorComponent ) { existingRule = filterRules.find( (fr) => - fr.rule_type == FILTER_TITLE_CONTENT || fr.rule_type == FILTER_TITLE + fr.rule_type == FILTER_TITLE_CONTENT || + fr.rule_type == FILTER_SIMPLE_TEXT || + fr.rule_type == FILTER_TITLE || + fr.rule_type == FILTER_SIMPLE_TITLE ) existingRule.rule_type = FILTER_FULLTEXT_QUERY } diff --git a/src-ui/src/app/data/filter-rule-type.ts b/src-ui/src/app/data/filter-rule-type.ts index 7f0f0d56d..6330eb44c 100644 --- a/src-ui/src/app/data/filter-rule-type.ts +++ b/src-ui/src/app/data/filter-rule-type.ts @@ -3,7 +3,7 @@ import { DataType } from './datatype' export const NEGATIVE_NULL_FILTER_VALUE = -1 // These correspond to src/documents/models.py and changes here require a DB migration (and vice versa) -export const FILTER_TITLE = 0 +export const FILTER_TITLE = 0 // Deprecated in favor of Tantivy-backed `title_search`. Keep for now for existing saved views export const FILTER_CONTENT = 1 export const FILTER_ASN = 2 @@ -46,7 +46,9 @@ export const FILTER_ADDED_FROM = 46 export const FILTER_MODIFIED_BEFORE = 15 export const FILTER_MODIFIED_AFTER = 16 -export const FILTER_TITLE_CONTENT = 19 +export const FILTER_TITLE_CONTENT = 19 // Deprecated in favor of Tantivy-backed `text` filtervar. Keep for now for existing saved views +export const FILTER_SIMPLE_TITLE = 48 +export const FILTER_SIMPLE_TEXT = 49 export const FILTER_FULLTEXT_QUERY = 20 export const FILTER_FULLTEXT_MORELIKE = 21 @@ -56,7 +58,7 @@ export const FILTER_OWNER_ISNULL = 34 export const FILTER_OWNER_DOES_NOT_INCLUDE = 35 export const FILTER_SHARED_BY_USER = 37 -export const FILTER_CUSTOM_FIELDS_TEXT = 36 +export const FILTER_CUSTOM_FIELDS_TEXT = 36 // Deprecated. UI no longer includes CF text-search mode. Keep for now for existing saved views export const FILTER_HAS_CUSTOM_FIELDS_ALL = 38 export const FILTER_HAS_CUSTOM_FIELDS_ANY = 39 export const FILTER_DOES_NOT_HAVE_CUSTOM_FIELDS = 40 @@ -66,6 +68,9 @@ export const FILTER_CUSTOM_FIELDS_QUERY = 42 export const FILTER_MIME_TYPE = 47 +export const SIMPLE_TEXT_PARAMETER = 'text' +export const SIMPLE_TITLE_PARAMETER = 'title_search' + export const FILTER_RULE_TYPES: FilterRuleType[] = [ { id: FILTER_TITLE, @@ -74,6 +79,13 @@ export const FILTER_RULE_TYPES: FilterRuleType[] = [ multi: false, default: '', }, + { + id: FILTER_SIMPLE_TITLE, + filtervar: SIMPLE_TITLE_PARAMETER, + datatype: 'string', + multi: false, + default: '', + }, { id: FILTER_CONTENT, filtervar: 'content__icontains', @@ -279,6 +291,12 @@ export const FILTER_RULE_TYPES: FilterRuleType[] = [ datatype: 'string', multi: false, }, + { + id: FILTER_SIMPLE_TEXT, + filtervar: SIMPLE_TEXT_PARAMETER, + datatype: 'string', + multi: false, + }, { id: FILTER_FULLTEXT_QUERY, filtervar: 'query', diff --git a/src-ui/src/app/services/rest/document.service.spec.ts b/src-ui/src/app/services/rest/document.service.spec.ts index 711aab743..03375e367 100644 --- a/src-ui/src/app/services/rest/document.service.spec.ts +++ b/src-ui/src/app/services/rest/document.service.spec.ts @@ -10,7 +10,7 @@ import { DOCUMENT_SORT_FIELDS, DOCUMENT_SORT_FIELDS_FULLTEXT, } from 'src/app/data/document' -import { FILTER_TITLE } from 'src/app/data/filter-rule-type' +import { FILTER_SIMPLE_TITLE } from 'src/app/data/filter-rule-type' import { SETTINGS_KEYS } from 'src/app/data/ui-settings' import { environment } from 'src/environments/environment' import { PermissionsService } from '../permissions.service' @@ -138,13 +138,13 @@ describe(`DocumentService`, () => { subscription = service .listAllFilteredIds([ { - rule_type: FILTER_TITLE, + rule_type: FILTER_SIMPLE_TITLE, value: 'apple', }, ]) .subscribe() const req = httpTestingController.expectOne( - `${environment.apiBaseUrl}${endpoint}/?page=1&page_size=100000&fields=id&title__icontains=apple` + `${environment.apiBaseUrl}${endpoint}/?page=1&page_size=100000&fields=id&title_search=apple` ) expect(req.request.method).toEqual('GET') }) diff --git a/src-ui/src/app/utils/query-params.spec.ts b/src-ui/src/app/utils/query-params.spec.ts index c22c90d11..7fd8f6808 100644 --- a/src-ui/src/app/utils/query-params.spec.ts +++ b/src-ui/src/app/utils/query-params.spec.ts @@ -8,6 +8,10 @@ import { FILTER_HAS_CUSTOM_FIELDS_ALL, FILTER_HAS_CUSTOM_FIELDS_ANY, FILTER_HAS_TAGS_ALL, + FILTER_SIMPLE_TEXT, + FILTER_SIMPLE_TITLE, + FILTER_TITLE, + FILTER_TITLE_CONTENT, NEGATIVE_NULL_FILTER_VALUE, } from '../data/filter-rule-type' import { @@ -128,6 +132,26 @@ describe('QueryParams Utils', () => { is_tagged: 0, }) + params = queryParamsFromFilterRules([ + { + rule_type: FILTER_TITLE_CONTENT, + value: 'bank statement', + }, + ]) + expect(params).toEqual({ + text: 'bank statement', + }) + + params = queryParamsFromFilterRules([ + { + rule_type: FILTER_TITLE, + value: 'invoice', + }, + ]) + expect(params).toEqual({ + title_search: 'invoice', + }) + params = queryParamsFromFilterRules([ { rule_type: FILTER_HAS_TAGS_ALL, @@ -148,6 +172,30 @@ describe('QueryParams Utils', () => { it('should convert filter rules to query params', () => { let rules = filterRulesFromQueryParams( + convertToParamMap({ + text: 'bank statement', + }) + ) + expect(rules).toEqual([ + { + rule_type: FILTER_SIMPLE_TEXT, + value: 'bank statement', + }, + ]) + + rules = filterRulesFromQueryParams( + convertToParamMap({ + title_search: 'invoice', + }) + ) + expect(rules).toEqual([ + { + rule_type: FILTER_SIMPLE_TITLE, + value: 'invoice', + }, + ]) + + rules = filterRulesFromQueryParams( convertToParamMap({ tags__id__all, }) diff --git a/src-ui/src/app/utils/query-params.ts b/src-ui/src/app/utils/query-params.ts index 27716cc2d..be33ba724 100644 --- a/src-ui/src/app/utils/query-params.ts +++ b/src-ui/src/app/utils/query-params.ts @@ -9,8 +9,14 @@ import { FILTER_HAS_CUSTOM_FIELDS_ALL, FILTER_HAS_CUSTOM_FIELDS_ANY, FILTER_RULE_TYPES, + FILTER_SIMPLE_TEXT, + FILTER_SIMPLE_TITLE, + FILTER_TITLE, + FILTER_TITLE_CONTENT, FilterRuleType, NEGATIVE_NULL_FILTER_VALUE, + SIMPLE_TEXT_PARAMETER, + SIMPLE_TITLE_PARAMETER, } from '../data/filter-rule-type' import { ListViewState } from '../services/document-list-view.service' @@ -97,6 +103,8 @@ export function transformLegacyFilterRules( export function filterRulesFromQueryParams( queryParams: ParamMap ): FilterRule[] { + let filterRulesFromQueryParams: FilterRule[] = [] + const allFilterRuleQueryParams: string[] = FILTER_RULE_TYPES.map( (rt) => rt.filtervar ) @@ -104,7 +112,6 @@ export function filterRulesFromQueryParams( .filter((rt) => rt !== undefined) // transform query params to filter rules - let filterRulesFromQueryParams: FilterRule[] = [] allFilterRuleQueryParams .filter((frqp) => queryParams.has(frqp)) .forEach((filterQueryParamName) => { @@ -146,7 +153,17 @@ export function queryParamsFromFilterRules(filterRules: FilterRule[]): Params { let params = {} for (let rule of filterRules) { let ruleType = FILTER_RULE_TYPES.find((t) => t.id == rule.rule_type) - if (ruleType.isnull_filtervar && rule.value == null) { + if ( + rule.rule_type === FILTER_TITLE_CONTENT || + rule.rule_type === FILTER_SIMPLE_TEXT + ) { + params[SIMPLE_TEXT_PARAMETER] = rule.value + } else if ( + rule.rule_type === FILTER_TITLE || + rule.rule_type === FILTER_SIMPLE_TITLE + ) { + params[SIMPLE_TITLE_PARAMETER] = rule.value + } else if (ruleType.isnull_filtervar && rule.value == null) { params[ruleType.isnull_filtervar] = 1 } else if ( ruleType.isnull_filtervar && diff --git a/src/documents/filters.py b/src/documents/filters.py index 2f7de1cd4..b2b226ee1 100644 --- a/src/documents/filters.py +++ b/src/documents/filters.py @@ -3,6 +3,7 @@ from __future__ import annotations import functools import inspect import json +import logging import operator from contextlib import contextmanager from typing import TYPE_CHECKING @@ -77,6 +78,8 @@ DATETIME_KWARGS = [ CUSTOM_FIELD_QUERY_MAX_DEPTH = 10 CUSTOM_FIELD_QUERY_MAX_ATOMS = 20 +logger = logging.getLogger("paperless.api") + class CorrespondentFilterSet(FilterSet): class Meta: @@ -162,9 +165,13 @@ class InboxFilter(Filter): @extend_schema_field(serializers.CharField) class TitleContentFilter(Filter): + # Deprecated but retained for existing saved views. UI uses Tantivy-backed `text` / `title_search` params. def filter(self, qs: Any, value: Any) -> Any: value = value.strip() if isinstance(value, str) else value if value: + logger.warning( + "Deprecated document filter parameter 'title_content' used; use `text` instead.", + ) try: return qs.filter( Q(title__icontains=value) | Q(effective_content__icontains=value), @@ -243,6 +250,9 @@ class CustomFieldsFilter(Filter): def filter(self, qs, value): value = value.strip() if isinstance(value, str) else value if value: + logger.warning( + "Deprecated document filter parameter 'custom_fields__icontains' used; use `custom_field_query` or advanced Tantivy field syntax instead.", + ) fields_with_matching_selects = CustomField.objects.filter( extra_data__icontains=value, ) @@ -747,6 +757,7 @@ class DocumentFilterSet(FilterSet): is_in_inbox = InboxFilter() + # Deprecated, but keep for now for existing saved views title_content = TitleContentFilter() content__istartswith = EffectiveContentFilter(lookup_expr="istartswith") @@ -756,6 +767,7 @@ class DocumentFilterSet(FilterSet): owner__id__none = ObjectFilter(field_name="owner", exclude=True) + # Deprecated, UI no longer includes CF text-search mode, but keep for now for existing saved views custom_fields__icontains = CustomFieldsFilter() custom_fields__id__all = ObjectFilter(field_name="custom_fields__field") diff --git a/src/documents/migrations/0018_saved_view_simple_search_rules.py b/src/documents/migrations/0018_saved_view_simple_search_rules.py new file mode 100644 index 000000000..6d128c593 --- /dev/null +++ b/src/documents/migrations/0018_saved_view_simple_search_rules.py @@ -0,0 +1,92 @@ +# Generated by Django 5.2.12 on 2026-04-01 18:20 + +from django.db import migrations +from django.db import models + +OLD_TITLE_RULE = 0 +OLD_TITLE_CONTENT_RULE = 19 +NEW_SIMPLE_TITLE_RULE = 48 +NEW_SIMPLE_TEXT_RULE = 49 + + +# See documents/models.py SavedViewFilterRule +def migrate_saved_view_rules_forward(apps, schema_editor): + SavedViewFilterRule = apps.get_model("documents", "SavedViewFilterRule") + SavedViewFilterRule.objects.filter(rule_type=OLD_TITLE_RULE).update( + rule_type=NEW_SIMPLE_TITLE_RULE, + ) + SavedViewFilterRule.objects.filter(rule_type=OLD_TITLE_CONTENT_RULE).update( + rule_type=NEW_SIMPLE_TEXT_RULE, + ) + + +class Migration(migrations.Migration): + dependencies = [ + ("documents", "0017_migrate_fulltext_query_field_prefixes"), + ] + + operations = [ + migrations.AlterField( + model_name="savedviewfilterrule", + name="rule_type", + field=models.PositiveSmallIntegerField( + choices=[ + (0, "title contains"), + (1, "content contains"), + (2, "ASN is"), + (3, "correspondent is"), + (4, "document type is"), + (5, "is in inbox"), + (6, "has tag"), + (7, "has any tag"), + (8, "created before"), + (9, "created after"), + (10, "created year is"), + (11, "created month is"), + (12, "created day is"), + (13, "added before"), + (14, "added after"), + (15, "modified before"), + (16, "modified after"), + (17, "does not have tag"), + (18, "does not have ASN"), + (19, "title or content contains"), + (20, "fulltext query"), + (21, "more like this"), + (22, "has tags in"), + (23, "ASN greater than"), + (24, "ASN less than"), + (25, "storage path is"), + (26, "has correspondent in"), + (27, "does not have correspondent in"), + (28, "has document type in"), + (29, "does not have document type in"), + (30, "has storage path in"), + (31, "does not have storage path in"), + (32, "owner is"), + (33, "has owner in"), + (34, "does not have owner"), + (35, "does not have owner in"), + (36, "has custom field value"), + (37, "is shared by me"), + (38, "has custom fields"), + (39, "has custom field in"), + (40, "does not have custom field in"), + (41, "does not have custom field"), + (42, "custom fields query"), + (43, "created to"), + (44, "created from"), + (45, "added to"), + (46, "added from"), + (47, "mime type is"), + (48, "simple title search"), + (49, "simple text search"), + ], + verbose_name="rule type", + ), + ), + migrations.RunPython( + migrate_saved_view_rules_forward, + migrations.RunPython.noop, + ), + ] diff --git a/src/documents/models.py b/src/documents/models.py index 96f027b94..9af5fbc23 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -623,6 +623,8 @@ class SavedViewFilterRule(models.Model): (45, _("added to")), (46, _("added from")), (47, _("mime type is")), + (48, _("simple title search")), + (49, _("simple text search")), ] saved_view = models.ForeignKey( diff --git a/src/documents/search/__init__.py b/src/documents/search/__init__.py index b0a89f242..a4145d7ef 100644 --- a/src/documents/search/__init__.py +++ b/src/documents/search/__init__.py @@ -1,4 +1,5 @@ from documents.search._backend import SearchIndexLockError +from documents.search._backend import SearchMode from documents.search._backend import SearchResults from documents.search._backend import TantivyBackend from documents.search._backend import TantivyRelevanceList @@ -10,6 +11,7 @@ from documents.search._schema import wipe_index __all__ = [ "SearchIndexLockError", + "SearchMode", "SearchResults", "TantivyBackend", "TantivyRelevanceList", diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index a1bff8a9f..405c24360 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -2,11 +2,11 @@ from __future__ import annotations import logging import threading -import unicodedata from collections import Counter from dataclasses import dataclass from datetime import UTC from datetime import datetime +from enum import StrEnum from typing import TYPE_CHECKING from typing import Self from typing import TypedDict @@ -19,7 +19,10 @@ from django.conf import settings from django.utils.timezone import get_current_timezone from guardian.shortcuts import get_users_with_perms +from documents.search._normalize import ascii_fold from documents.search._query import build_permission_filter +from documents.search._query import parse_simple_text_query +from documents.search._query import parse_simple_title_query from documents.search._query import parse_user_query from documents.search._schema import _write_sentinels from documents.search._schema import build_schema @@ -45,14 +48,10 @@ _AUTOCOMPLETE_REGEX_TIMEOUT = 1.0 # seconds; guards against ReDoS on untrusted T = TypeVar("T") -def _ascii_fold(s: str) -> str: - """ - Normalize unicode to ASCII equivalent characters for search consistency. - - Converts accented characters (e.g., "café") to their ASCII base forms ("cafe") - to enable cross-language searching without requiring exact diacritic matching. - """ - return unicodedata.normalize("NFD", s).encode("ascii", "ignore").decode() +class SearchMode(StrEnum): + QUERY = "query" + TEXT = "text" + TITLE = "title" def _extract_autocomplete_words(text_sources: list[str]) -> set[str]: @@ -74,7 +73,7 @@ def _extract_autocomplete_words(text_sources: list[str]) -> set[str]: ) continue for token in tokens: - normalized = _ascii_fold(token.lower()) + normalized = ascii_fold(token.lower()) if normalized: words.add(normalized) return words @@ -294,8 +293,10 @@ class TantivyBackend: doc.add_text("checksum", document.checksum) doc.add_text("title", document.title) doc.add_text("title_sort", document.title) + doc.add_text("simple_title", document.title) doc.add_text("content", content) doc.add_text("bigram_content", content) + doc.add_text("simple_content", content) # Original filename - only add if not None/empty if document.original_filename: @@ -433,6 +434,7 @@ class TantivyBackend: sort_field: str | None, *, sort_reverse: bool, + search_mode: SearchMode = SearchMode.QUERY, ) -> SearchResults: """ Execute a search query against the document index. @@ -441,20 +443,32 @@ class TantivyBackend: permission filtering before executing against Tantivy. Supports both relevance-based and field-based sorting. + QUERY search mode supports natural date keywords, field filters, etc. + TITLE search mode treats the query as plain text to search for in title only + TEXT search mode treats the query as plain text to search for in title and content + Args: - query: User's search query (supports natural date keywords, field filters) + query: User's search query user: User for permission filtering (None for superuser/no filtering) page: Page number (1-indexed) for pagination page_size: Number of results per page sort_field: Field to sort by (None for relevance ranking) sort_reverse: Whether to reverse the sort order + search_mode: "query" for advanced Tantivy syntax, "text" for + plain-text search over title and content only, "title" for + plain-text search over title only Returns: SearchResults with hits, total count, and processed query """ self._ensure_open() tz = get_current_timezone() - user_query = parse_user_query(self._index, query, tz) + if search_mode is SearchMode.TEXT: + user_query = parse_simple_text_query(self._index, query) + elif search_mode is SearchMode.TITLE: + user_query = parse_simple_title_query(self._index, query) + else: + user_query = parse_user_query(self._index, query, tz) # Apply permission filter if user is not None (not superuser) if user is not None: @@ -594,7 +608,7 @@ class TantivyBackend: List of word suggestions ordered by frequency, then alphabetically """ self._ensure_open() - normalized_term = _ascii_fold(term.lower()) + normalized_term = ascii_fold(term.lower()) searcher = self._index.searcher() diff --git a/src/documents/search/_normalize.py b/src/documents/search/_normalize.py new file mode 100644 index 000000000..3d7b23f33 --- /dev/null +++ b/src/documents/search/_normalize.py @@ -0,0 +1,8 @@ +from __future__ import annotations + +import unicodedata + + +def ascii_fold(text: str) -> str: + """Normalize unicode text to ASCII equivalents for search consistency.""" + return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode() diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index 212df1516..b7bcbbe9c 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -12,6 +12,8 @@ import tantivy from dateutil.relativedelta import relativedelta from django.conf import settings +from documents.search._normalize import ascii_fold + if TYPE_CHECKING: from datetime import tzinfo @@ -51,6 +53,7 @@ _WHOOSH_REL_RANGE_RE = regex.compile( ) # Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly _DATE8_RE = regex.compile(r"(?P\w+):(?P\d{8})\b") +_SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+") def _fmt(dt: datetime) -> str: @@ -436,7 +439,37 @@ DEFAULT_SEARCH_FIELDS = [ "document_type", "tag", ] +SIMPLE_SEARCH_FIELDS = ["simple_title", "simple_content"] +TITLE_SEARCH_FIELDS = ["simple_title"] _FIELD_BOOSTS = {"title": 2.0} +_SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0} + + +def _build_simple_field_query( + index: tantivy.Index, + field: str, + tokens: list[str], +) -> tantivy.Query: + patterns = [] + for idx, token in enumerate(tokens): + escaped = regex.escape(token) + # For multi-token substring search, only the first token can begin mid-word. + # Later tokens follow a whitespace boundary in the original query, so anchor + # them to the start of the next indexed token to reduce false positives like + # matching "Z-Berichte 16" for the query "Z-Berichte 6". + if idx == 0: + patterns.append(f".*{escaped}.*") + else: + patterns.append(f"{escaped}.*") + if len(patterns) == 1: + query = tantivy.Query.regex_query(index.schema, field, patterns[0]) + else: + query = tantivy.Query.regex_phrase_query(index.schema, field, patterns) + + boost = _SIMPLE_FIELD_BOOSTS.get(field, 1.0) + if boost > 1.0: + return tantivy.Query.boost_query(query, boost) + return query def parse_user_query( @@ -495,3 +528,52 @@ def parse_user_query( ) return exact + + +def parse_simple_query( + index: tantivy.Index, + raw_query: str, + fields: list[str], +) -> tantivy.Query: + """ + Parse a plain-text query using Tantivy over a restricted field set. + + Query string is escaped and normalized to be treated as "simple" text query. + """ + tokens = [ + ascii_fold(token.lower()) + for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT) + ] + tokens = [token for token in tokens if token] + if not tokens: + return tantivy.Query.empty_query() + + field_queries = [ + (tantivy.Occur.Should, _build_simple_field_query(index, field, tokens)) + for field in fields + ] + if len(field_queries) == 1: + return field_queries[0][1] + return tantivy.Query.boolean_query(field_queries) + + +def parse_simple_text_query( + index: tantivy.Index, + raw_query: str, +) -> tantivy.Query: + """ + Parse a plain-text query over title/content for simple search inputs. + """ + + return parse_simple_query(index, raw_query, SIMPLE_SEARCH_FIELDS) + + +def parse_simple_title_query( + index: tantivy.Index, + raw_query: str, +) -> tantivy.Query: + """ + Parse a plain-text query over the title field only. + """ + + return parse_simple_query(index, raw_query, TITLE_SEARCH_FIELDS) diff --git a/src/documents/search/_schema.py b/src/documents/search/_schema.py index ba6646007..5e9404235 100644 --- a/src/documents/search/_schema.py +++ b/src/documents/search/_schema.py @@ -53,6 +53,18 @@ def build_schema() -> tantivy.Schema: # CJK support - not stored, indexed only sb.add_text_field("bigram_content", stored=False, tokenizer_name="bigram_analyzer") + # Simple substring search support for title/content - not stored, indexed only + sb.add_text_field( + "simple_title", + stored=False, + tokenizer_name="simple_search_analyzer", + ) + sb.add_text_field( + "simple_content", + stored=False, + tokenizer_name="simple_search_analyzer", + ) + # Autocomplete prefix scan - stored, not indexed sb.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw") diff --git a/src/documents/search/_tokenizer.py b/src/documents/search/_tokenizer.py index e597a879e..2079ca4cc 100644 --- a/src/documents/search/_tokenizer.py +++ b/src/documents/search/_tokenizer.py @@ -70,6 +70,7 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None: index.register_tokenizer("paperless_text", _paperless_text(language)) index.register_tokenizer("simple_analyzer", _simple_analyzer()) index.register_tokenizer("bigram_analyzer", _bigram_analyzer()) + index.register_tokenizer("simple_search_analyzer", _simple_search_analyzer()) # Fast-field tokenizer required for fast=True text fields in the schema index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer()) @@ -114,3 +115,16 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer: .filter(tantivy.Filter.lowercase()) .build() ) + + +def _simple_search_analyzer() -> tantivy.TextAnalyzer: + """Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(65) -> lowercase -> ascii_fold.""" + return ( + tantivy.TextAnalyzerBuilder( + tantivy.Tokenizer.regex(r"\S+"), + ) + .filter(tantivy.Filter.remove_long(65)) + .filter(tantivy.Filter.lowercase()) + .filter(tantivy.Filter.ascii_fold()) + .build() + ) diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py index 5c92da447..ff9638e63 100644 --- a/src/documents/tests/search/test_backend.py +++ b/src/documents/tests/search/test_backend.py @@ -5,6 +5,7 @@ from documents.models import CustomField from documents.models import CustomFieldInstance from documents.models import Document from documents.models import Note +from documents.search._backend import SearchMode from documents.search._backend import TantivyBackend from documents.search._backend import get_backend from documents.search._backend import reset_backend @@ -46,6 +47,258 @@ class TestWriteBatch: class TestSearch: """Test search functionality.""" + def test_text_mode_limits_default_search_to_title_and_content( + self, + backend: TantivyBackend, + ): + """Simple text mode must not match metadata-only fields.""" + doc = Document.objects.create( + title="Invoice document", + content="monthly statement", + checksum="TXT1", + pk=9, + ) + backend.add_or_update(doc) + + metadata_only = backend.search( + "document_type:invoice", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TEXT, + ) + assert metadata_only.total == 0 + + content_match = backend.search( + "monthly", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TEXT, + ) + assert content_match.total == 1 + + def test_title_mode_limits_default_search_to_title_only( + self, + backend: TantivyBackend, + ): + """Title mode must not match content-only terms.""" + doc = Document.objects.create( + title="Invoice document", + content="monthly statement", + checksum="TXT2", + pk=10, + ) + backend.add_or_update(doc) + + content_only = backend.search( + "monthly", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TITLE, + ) + assert content_only.total == 0 + + title_match = backend.search( + "invoice", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TITLE, + ) + assert title_match.total == 1 + + def test_text_mode_matches_partial_term_substrings( + self, + backend: TantivyBackend, + ): + """Simple text mode should support substring matching within tokens.""" + doc = Document.objects.create( + title="Account access", + content="password reset instructions", + checksum="TXT3", + pk=11, + ) + backend.add_or_update(doc) + + prefix_match = backend.search( + "pass", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TEXT, + ) + assert prefix_match.total == 1 + + infix_match = backend.search( + "sswo", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TEXT, + ) + assert infix_match.total == 1 + + phrase_match = backend.search( + "sswo re", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TEXT, + ) + assert phrase_match.total == 1 + + def test_text_mode_does_not_match_on_partial_term_overlap( + self, + backend: TantivyBackend, + ): + """Simple text mode should not match documents that merely share partial fragments.""" + doc = Document.objects.create( + title="Adobe Acrobat PDF Files", + content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + checksum="TXT7", + pk=13, + ) + backend.add_or_update(doc) + + non_match = backend.search( + "raptor", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TEXT, + ) + assert non_match.total == 0 + + def test_text_mode_anchors_later_query_tokens_to_token_starts( + self, + backend: TantivyBackend, + ): + """Multi-token simple search should not match later tokens in the middle of a word.""" + exact_doc = Document.objects.create( + title="Z-Berichte 6", + content="monthly report", + checksum="TXT9", + pk=15, + ) + prefix_doc = Document.objects.create( + title="Z-Berichte 60", + content="monthly report", + checksum="TXT10", + pk=16, + ) + false_positive = Document.objects.create( + title="Z-Berichte 16", + content="monthly report", + checksum="TXT11", + pk=17, + ) + backend.add_or_update(exact_doc) + backend.add_or_update(prefix_doc) + backend.add_or_update(false_positive) + + results = backend.search( + "Z-Berichte 6", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TEXT, + ) + result_ids = {hit["id"] for hit in results.hits} + + assert exact_doc.id in result_ids + assert prefix_doc.id in result_ids + assert false_positive.id not in result_ids + + def test_text_mode_ignores_queries_without_searchable_tokens( + self, + backend: TantivyBackend, + ): + """Simple text mode should safely return no hits for symbol-only strings.""" + doc = Document.objects.create( + title="Guide", + content="This is a guide.", + checksum="TXT8", + pk=14, + ) + backend.add_or_update(doc) + + no_tokens = backend.search( + "!!!", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TEXT, + ) + assert no_tokens.total == 0 + + def test_title_mode_matches_partial_term_substrings( + self, + backend: TantivyBackend, + ): + """Title mode should support substring matching within title tokens.""" + doc = Document.objects.create( + title="Password guide", + content="reset instructions", + checksum="TXT4", + pk=12, + ) + backend.add_or_update(doc) + + prefix_match = backend.search( + "pass", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TITLE, + ) + assert prefix_match.total == 1 + + infix_match = backend.search( + "sswo", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TITLE, + ) + assert infix_match.total == 1 + + phrase_match = backend.search( + "sswo gu", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TITLE, + ) + assert phrase_match.total == 1 + def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend): """Search scores must be normalized so top hit has score 1.0 for UI consistency.""" for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]): diff --git a/src/documents/tests/search/test_tokenizer.py b/src/documents/tests/search/test_tokenizer.py index aee52a567..fc2c41231 100644 --- a/src/documents/tests/search/test_tokenizer.py +++ b/src/documents/tests/search/test_tokenizer.py @@ -8,6 +8,7 @@ import tantivy from documents.search._tokenizer import _bigram_analyzer from documents.search._tokenizer import _paperless_text +from documents.search._tokenizer import _simple_search_analyzer from documents.search._tokenizer import register_tokenizers if TYPE_CHECKING: @@ -41,6 +42,20 @@ class TestTokenizers: idx.register_tokenizer("bigram_analyzer", _bigram_analyzer()) return idx + @pytest.fixture + def simple_search_index(self) -> tantivy.Index: + """Index with simple-search field for Latin substring tests.""" + sb = tantivy.SchemaBuilder() + sb.add_text_field( + "simple_content", + stored=False, + tokenizer_name="simple_search_analyzer", + ) + schema = sb.build() + idx = tantivy.Index(schema, path=None) + idx.register_tokenizer("simple_search_analyzer", _simple_search_analyzer()) + return idx + def test_ascii_fold_finds_accented_content( self, content_index: tantivy.Index, @@ -66,6 +81,24 @@ class TestTokenizers: q = bigram_index.parse_query("東京", ["bigram_content"]) assert bigram_index.searcher().search(q, limit=5).count == 1 + def test_simple_search_analyzer_supports_regex_substrings( + self, + simple_search_index: tantivy.Index, + ) -> None: + """Whitespace-preserving simple search analyzer supports substring regex matching.""" + writer = simple_search_index.writer() + doc = tantivy.Document() + doc.add_text("simple_content", "tag:invoice password-reset") + writer.add_document(doc) + writer.commit() + simple_search_index.reload() + q = tantivy.Query.regex_query( + simple_search_index.schema, + "simple_content", + ".*sswo.*", + ) + assert simple_search_index.searcher().search(q, limit=5).count == 1 + def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None: """Unsupported language codes should log a warning and disable stemming gracefully.""" sb = tantivy.SchemaBuilder() diff --git a/src/documents/tests/test_api_search.py b/src/documents/tests/test_api_search.py index 69bd65198..9e0879e89 100644 --- a/src/documents/tests/test_api_search.py +++ b/src/documents/tests/test_api_search.py @@ -91,6 +91,135 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): self.assertEqual(response.data["count"], 0) self.assertEqual(len(results), 0) + def test_simple_text_search(self) -> None: + tagged = Tag.objects.create(name="invoice") + matching_doc = Document.objects.create( + title="Quarterly summary", + content="Monthly bank report", + checksum="T1", + pk=11, + ) + matching_doc.tags.add(tagged) + + metadata_only_doc = Document.objects.create( + title="Completely unrelated", + content="No matching terms here", + checksum="T2", + pk=12, + ) + metadata_only_doc.tags.add(tagged) + + backend = get_backend() + backend.add_or_update(matching_doc) + backend.add_or_update(metadata_only_doc) + + response = self.client.get("/api/documents/?text=monthly") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 1) + self.assertEqual(response.data["results"][0]["id"], matching_doc.id) + + response = self.client.get("/api/documents/?text=tag:invoice") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 0) + + def test_simple_text_search_matches_substrings(self) -> None: + matching_doc = Document.objects.create( + title="Quarterly summary", + content="Password reset instructions", + checksum="T5", + pk=15, + ) + + backend = get_backend() + backend.add_or_update(matching_doc) + + response = self.client.get("/api/documents/?text=pass") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 1) + self.assertEqual(response.data["results"][0]["id"], matching_doc.id) + + response = self.client.get("/api/documents/?text=sswo") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 1) + self.assertEqual(response.data["results"][0]["id"], matching_doc.id) + + response = self.client.get("/api/documents/?text=sswo re") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 1) + self.assertEqual(response.data["results"][0]["id"], matching_doc.id) + + def test_simple_text_search_does_not_match_on_partial_term_overlap(self) -> None: + non_matching_doc = Document.objects.create( + title="Adobe Acrobat PDF Files", + content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + checksum="T7", + pk=17, + ) + + backend = get_backend() + backend.add_or_update(non_matching_doc) + + response = self.client.get("/api/documents/?text=raptor") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 0) + + def test_simple_title_search(self) -> None: + title_match = Document.objects.create( + title="Quarterly summary", + content="No matching content here", + checksum="T3", + pk=13, + ) + content_only = Document.objects.create( + title="Completely unrelated", + content="Quarterly summary appears only in content", + checksum="T4", + pk=14, + ) + + backend = get_backend() + backend.add_or_update(title_match) + backend.add_or_update(content_only) + + response = self.client.get("/api/documents/?title_search=quarterly") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 1) + self.assertEqual(response.data["results"][0]["id"], title_match.id) + + def test_simple_title_search_matches_substrings(self) -> None: + title_match = Document.objects.create( + title="Password handbook", + content="No matching content here", + checksum="T6", + pk=16, + ) + + backend = get_backend() + backend.add_or_update(title_match) + + response = self.client.get("/api/documents/?title_search=pass") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 1) + self.assertEqual(response.data["results"][0]["id"], title_match.id) + + response = self.client.get("/api/documents/?title_search=sswo") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 1) + self.assertEqual(response.data["results"][0]["id"], title_match.id) + + response = self.client.get("/api/documents/?title_search=sswo hand") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 1) + self.assertEqual(response.data["results"][0]["id"], title_match.id) + + def test_search_rejects_multiple_search_modes(self) -> None: + response = self.client.get("/api/documents/?text=bank&query=bank") + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertEqual( + response.data["detail"], + "Specify only one of text, title_search, query, or more_like_id.", + ) + def test_search_returns_all_for_api_version_9(self) -> None: d1 = Document.objects.create( title="invoice", @@ -1493,6 +1622,31 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): self.assertEqual(results["custom_fields"][0]["id"], custom_field1.id) self.assertEqual(results["workflows"][0]["id"], workflow1.id) + def test_global_search_db_only_limits_documents_to_title_matches(self) -> None: + title_match = Document.objects.create( + title="bank statement", + content="no additional terms", + checksum="GS1", + pk=21, + ) + content_only = Document.objects.create( + title="not a title match", + content="bank appears only in content", + checksum="GS2", + pk=22, + ) + + backend = get_backend() + backend.add_or_update(title_match) + backend.add_or_update(content_only) + + self.client.force_authenticate(self.user) + + response = self.client.get("/api/search/?query=bank&db_only=true") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(len(response.data["documents"]), 1) + self.assertEqual(response.data["documents"][0]["id"], title_match.id) + def test_global_search_filters_owned_mail_objects(self) -> None: user1 = User.objects.create_user("mail-search-user") user2 = User.objects.create_user("other-mail-search-user") diff --git a/src/documents/views.py b/src/documents/views.py index 024e846a0..68d2b7961 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -1995,11 +1995,23 @@ class ChatStreamingView(GenericAPIView): list=extend_schema( description="Document views including search", parameters=[ + OpenApiParameter( + name="text", + type=OpenApiTypes.STR, + location=OpenApiParameter.QUERY, + description="Simple Tantivy-backed text search query string", + ), + OpenApiParameter( + name="title_search", + type=OpenApiTypes.STR, + location=OpenApiParameter.QUERY, + description="Simple Tantivy-backed title-only search query string", + ), OpenApiParameter( name="query", type=OpenApiTypes.STR, location=OpenApiParameter.QUERY, - description="Advanced search query string", + description="Advanced Tantivy search query string", ), OpenApiParameter( name="full_perms", @@ -2025,22 +2037,28 @@ class ChatStreamingView(GenericAPIView): ), ) class UnifiedSearchViewSet(DocumentViewSet): + SEARCH_PARAM_NAMES = ("text", "title_search", "query", "more_like_id") + def get_serializer_class(self): if self._is_search_request(): return SearchResultSerializer else: return DocumentSerializer + def _get_active_search_params(self, request: Request | None = None) -> list[str]: + request = request or self.request + return [ + param for param in self.SEARCH_PARAM_NAMES if param in request.query_params + ] + def _is_search_request(self): - return ( - "query" in self.request.query_params - or "more_like_id" in self.request.query_params - ) + return bool(self._get_active_search_params()) def list(self, request, *args, **kwargs): if not self._is_search_request(): return super().list(request) + from documents.search import SearchMode from documents.search import TantivyRelevanceList from documents.search import get_backend @@ -2050,9 +2068,31 @@ class UnifiedSearchViewSet(DocumentViewSet): filtered_qs = self.filter_queryset(self.get_queryset()) user = None if request.user.is_superuser else request.user + active_search_params = self._get_active_search_params(request) - if "query" in request.query_params: - query_str = request.query_params["query"] + if len(active_search_params) > 1: + raise ValidationError( + { + "detail": _( + "Specify only one of text, title_search, query, or more_like_id.", + ), + }, + ) + + if ( + "text" in request.query_params + or "title_search" in request.query_params + or "query" in request.query_params + ): + if "text" in request.query_params: + search_mode = SearchMode.TEXT + query_str = request.query_params["text"] + elif "title_search" in request.query_params: + search_mode = SearchMode.TITLE + query_str = request.query_params["title_search"] + else: + search_mode = SearchMode.QUERY + query_str = request.query_params["query"] results = backend.search( query_str, user=user, @@ -2060,6 +2100,7 @@ class UnifiedSearchViewSet(DocumentViewSet): page_size=10000, sort_field=None, sort_reverse=False, + search_mode=search_mode, ) else: # more_like_id — validate permission on the seed document first @@ -2132,6 +2173,8 @@ class UnifiedSearchViewSet(DocumentViewSet): if str(e.detail) == str(invalid_more_like_id_message): return HttpResponseForbidden(invalid_more_like_id_message) return HttpResponseForbidden(_("Insufficient permissions.")) + except ValidationError: + raise except Exception as e: logger.warning(f"An error occurred listing search results: {e!s}") return HttpResponseBadRequest( @@ -3003,6 +3046,9 @@ class GlobalSearchView(PassUserMixin): serializer_class = SearchResultSerializer def get(self, request, *args, **kwargs): + from documents.search import SearchMode + from documents.search import get_backend + query = request.query_params.get("query", None) if query is None: return HttpResponseBadRequest("Query required") @@ -3019,25 +3065,25 @@ class GlobalSearchView(PassUserMixin): "view_document", Document, ) - # First search by title - docs = all_docs.filter(title__icontains=query) - if not db_only and len(docs) < OBJECT_LIMIT: - # If we don't have enough results, search by content. - # Over-fetch from Tantivy (no permission filter) and rely on - # the ORM all_docs queryset for authoritative permission gating. - from documents.search import get_backend - + if db_only: + docs = all_docs.filter(title__icontains=query)[:OBJECT_LIMIT] + else: + user = None if request.user.is_superuser else request.user fts_results = get_backend().search( query, - user=None, + user=user, page=1, page_size=1000, sort_field=None, sort_reverse=False, + search_mode=SearchMode.TEXT, ) - fts_ids = {h["id"] for h in fts_results.hits} - docs = docs | all_docs.filter(id__in=fts_ids) - docs = docs[:OBJECT_LIMIT] + docs_by_id = all_docs.in_bulk([hit["id"] for hit in fts_results.hits]) + docs = [ + docs_by_id[hit["id"]] + for hit in fts_results.hits + if hit["id"] in docs_by_id + ][:OBJECT_LIMIT] saved_views = ( get_objects_for_user_owner_aware( request.user,