Compare commits

..

1 Commits

Author SHA1 Message Date
stumpylog 3a891b38a8 Drops the search shims 2026-06-15 19:22:59 -07:00
39 changed files with 96 additions and 1641 deletions
-7
View File
@@ -2068,13 +2068,6 @@ context by default.
Defaults to 8192.
#### [`PAPERLESS_AI_LLM_REQUEST_TIMEOUT=<int>`](#PAPERLESS_AI_LLM_REQUEST_TIMEOUT) {#PAPERLESS_AI_LLM_REQUEST_TIMEOUT}
: The timeout, in seconds, for requests to the configured AI backend. Increase this when using
local or slow inference servers that need more time to generate responses.
Defaults to 120.
#### [`PAPERLESS_AI_LLM_BACKEND=<str>`](#PAPERLESS_AI_LLM_BACKEND) {#PAPERLESS_AI_LLM_BACKEND}
: The AI backend to use. This can be either "openai-like" or "ollama". If set to "ollama", the AI
+1 -1
View File
@@ -26,7 +26,7 @@ module.exports = {
'abstract-paperless-service',
],
transformIgnorePatterns: [
'node_modules/(?!.*(\\.mjs$|tslib|lodash-es|normalize-diacritics|@angular/common/locales/.*\\.js$))',
'node_modules/(?!.*(\\.mjs$|tslib|lodash-es|@angular/common/locales/.*\\.js$))',
],
moduleNameMapper: {
...esmPreset.moduleNameMapper,
-1
View File
@@ -32,7 +32,6 @@
"ngx-cookie-service": "^21.3.1",
"ngx-device-detector": "^11.0.0",
"ngx-ui-tour-ng-bootstrap": "^18.0.0",
"normalize-diacritics": "^5.0.0",
"pdfjs-dist": "^5.7.284",
"rxjs": "^7.8.2",
"tslib": "^2.8.1",
-11
View File
@@ -71,9 +71,6 @@ importers:
ngx-ui-tour-ng-bootstrap:
specifier: ^18.0.0
version: 18.0.0(f910a33494d223bd6dd07ce1bf22a35e)
normalize-diacritics:
specifier: ^5.0.0
version: 5.0.0
pdfjs-dist:
specifier: ^5.7.284
version: 5.7.284
@@ -5519,10 +5516,6 @@ packages:
engines: {node: ^20.17.0 || >=22.9.0}
hasBin: true
normalize-diacritics@5.0.0:
resolution: {integrity: sha512-t6czCJOpbAtckN1wCC2qPWnO3GQvNANb9bcUNbiOLEqojVuP31+ELIs5KhEG8jyz0TH7iD9BWxWz8O3ic2/rMQ==}
engines: {node: '>= 14.x', npm: '>= 6.x'}
normalize-path@3.0.0:
resolution: {integrity: sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==}
engines: {node: '>=0.10.0'}
@@ -12938,10 +12931,6 @@ snapshots:
dependencies:
abbrev: 4.0.0
normalize-diacritics@5.0.0:
dependencies:
tslib: 2.8.1
normalize-path@3.0.0: {}
npm-bundled@5.0.0:
@@ -23,7 +23,6 @@ import {
import { CustomFieldsService } from 'src/app/services/rest/custom-fields.service'
import { ToastService } from 'src/app/services/toast.service'
import { pngxPopperOptions } from 'src/app/utils/popper-options'
import { matchesSearchText } from 'src/app/utils/text-search'
import { LoadingComponentWithPermissions } from '../../loading-component/loading.component'
import { CustomFieldEditDialogComponent } from '../edit-dialog/custom-field-edit-dialog/custom-field-edit-dialog.component'
@@ -70,7 +69,9 @@ export class CustomFieldsDropdownComponent extends LoadingComponentWithPermissio
public get filteredFields(): CustomField[] {
return this.unusedFields.filter(
(f) => !this.filterText || matchesSearchText(f.name, this.filterText)
(f) =>
!this.filterText ||
f.name.toLowerCase().includes(this.filterText.toLowerCase())
)
}
@@ -63,7 +63,6 @@
[(ngModel)]="atom.value"
[disabled]="disabled"
[virtualScroll]="getSelectOptionsForField(atom.field)?.length > 100"
[searchFn]="selectOptionSearchFn"
(mousedown)="$event.stopImmediatePropagation()"
></ng-select>
} @else if (getCustomFieldByID(atom.field)?.data_type === CustomFieldDataType.DocumentLink) {
@@ -82,7 +81,6 @@
[disabled]="disabled"
bindLabel="name"
bindValue="id"
[searchFn]="customFieldSearchFn"
(mousedown)="$event.stopImmediatePropagation()"
></ng-select>
<select class="w-25 form-select" [(ngModel)]="atom.operator" [disabled]="disabled">
@@ -127,7 +125,6 @@
[(ngModel)]="atom.value"
[disabled]="disabled"
[multiple]="true"
[searchFn]="selectOptionSearchFn"
(mousedown)="$event.stopImmediatePropagation()"
></ng-select>
}
@@ -36,7 +36,6 @@ import {
CustomFieldQueryExpression,
} from 'src/app/utils/custom-field-query-element'
import { pngxPopperOptions } from 'src/app/utils/popper-options'
import { matchesSearchText } from 'src/app/utils/text-search'
import { LoadingComponentWithPermissions } from '../../loading-component/loading.component'
import { ClearableBadgeComponent } from '../clearable-badge/clearable-badge.component'
import { DocumentLinkComponent } from '../input/document-link/document-link.component'
@@ -282,14 +281,6 @@ export class CustomFieldsQueryDropdownComponent extends LoadingComponentWithPerm
public readonly today: string = new Date().toLocaleDateString('en-CA')
public customFieldSearchFn = (term: string, field: CustomField): boolean =>
matchesSearchText(field?.name, term)
public selectOptionSearchFn = (
term: string,
option: { id: string; label: string }
): boolean => matchesSearchText(option?.label, term)
constructor() {
super()
this.selectionModel = new CustomFieldQueriesModel()
@@ -28,7 +28,6 @@
[notFoundText]="notFoundText"
[multiple]="multiple"
[bindLabel]="bindLabel"
[searchFn]="searchFn"
bindValue="id"
[virtualScroll]="items?.length > 100"
(change)="onChange(value)"
@@ -112,15 +112,6 @@ describe('SelectComponent', () => {
expect(createNewVal).toEqual('baz')
})
it('should search items by independent normalized terms', () => {
expect(
component.searchFn('tax 26', { id: 11, name: 'Tax\u00e9s 2026' })
).toBeTruthy()
expect(
component.searchFn('tax receipt', { id: 11, name: 'Tax\u00e9s 2026' })
).toBeFalsy()
})
it('should clear search term on blur after delay', fakeAsync(() => {
const clearSpy = jest.spyOn(component, 'clearLastSearchTerm')
component.onBlur()
@@ -13,7 +13,6 @@ import {
import { RouterModule } from '@angular/router'
import { NgSelectModule } from '@ng-select/ng-select'
import { NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
import { matchesSearchText } from 'src/app/utils/text-search'
import { AbstractInputComponent } from '../abstract-input'
@Component({
@@ -100,9 +99,6 @@ export class SelectComponent extends AbstractInputComponent<number> {
@Input()
bindLabel: string = 'name'
public searchFn = (term: string, item: any): boolean =>
matchesSearchText(item?.[this.bindLabel], term)
@Input()
showFilter: boolean = false
@@ -14,7 +14,6 @@
[clearSearchOnAdd]="true"
[hideSelected]="tags.length > 0"
[addTag]="allowCreate ? createTagRef : false"
[searchFn]="searchFn"
addTagText="Add tag"
i18n-addTagText
(add)="onAdd($event)"
@@ -171,15 +171,6 @@ describe('TagsComponent', () => {
expect(component.getTag(4)).toBeUndefined()
})
it('should search tags by independent normalized terms including parents', () => {
const parent: Tag = { id: 11, name: 'Financ\u00e9' }
const child: Tag = { id: 12, name: 'Taxes 2026', parent: parent.id }
component.tags = [parent, child]
expect(component.searchFn('finance 26', child)).toBeTruthy()
expect(component.searchFn('finance receipt', child)).toBeFalsy()
})
it('should emit filtered documents', () => {
component.value = [10]
component.tags = tags
@@ -21,7 +21,6 @@ import { NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
import { first, firstValueFrom, tap } from 'rxjs'
import { Tag } from 'src/app/data/tag'
import { TagService } from 'src/app/services/rest/tag.service'
import { matchesSearchText } from 'src/app/utils/text-search'
import { EditDialogMode } from '../../edit-dialog/edit-dialog.component'
import { TagEditDialogComponent } from '../../edit-dialog/tag-edit-dialog/tag-edit-dialog.component'
import { TagComponent } from '../../tag/tag.component'
@@ -115,14 +114,6 @@ export class TagsComponent implements OnInit, ControlValueAccessor {
public createTagRef: (name) => void
public searchFn = (term: string, tag: Tag): boolean =>
matchesSearchText(
[this.getParentChain(tag?.id).map((parent) => parent.name), tag?.name]
.flat()
.join(' '),
term
)
getTag(id: number) {
if (this.tags) {
return this.tags.find((tag) => tag.id == id)
-9
View File
@@ -360,14 +360,6 @@ export const PaperlessConfigOptions: ConfigOption[] = [
category: ConfigCategory.AI,
note: $localize`Language to use for generated AI suggestions. When unset, AI suggestions use the user's display language if explicitly set.`,
},
{
key: 'llm_request_timeout',
title: $localize`LLM Request Timeout`,
type: ConfigOptionType.Number,
config_key: 'PAPERLESS_AI_LLM_REQUEST_TIMEOUT',
category: ConfigCategory.AI,
note: $localize`Timeout in seconds for LLM requests.`,
},
]
export interface PaperlessConfig extends ObjectWithId {
@@ -409,5 +401,4 @@ export interface PaperlessConfig extends ObjectWithId {
llm_api_key: string
llm_endpoint: string
llm_output_language: string
llm_request_timeout: number
}
+3 -2
View File
@@ -1,6 +1,5 @@
import { Pipe, PipeTransform } from '@angular/core'
import { MatchingModel } from '../data/matching-model'
import { matchesSearchText } from '../utils/text-search'
@Pipe({
name: 'filter',
@@ -22,7 +21,9 @@ export class FilterPipe implements PipeTransform {
typeof item[key] === 'string' || typeof item[key] === 'number'
)
return keys.some((key) => {
return matchesSearchText(item[key], searchText)
return String(item[key])
.toLowerCase()
.includes(searchText.toLowerCase())
})
})
}
-17
View File
@@ -1,17 +0,0 @@
import { matchesSearchText } from './text-search'
describe('text search utilities', () => {
it('matches text accent-insensitively', () => {
expect(matchesSearchText('R\u00e9sum\u00e9', 'resume')).toBeTruthy()
expect(matchesSearchText('S\u00f8ren', 'soren')).toBeTruthy()
expect(matchesSearchText('\u0152uvre', 'oeuvre')).toBeTruthy()
expect(matchesSearchText('Invoice', 'receipt')).toBeFalsy()
})
it('matches all whitespace-separated search terms independently', () => {
expect(matchesSearchText('taxes 2026', 'tax 26')).toBeTruthy()
expect(matchesSearchText('2026 taxes', 'tax 26')).toBeTruthy()
expect(matchesSearchText('Tax\u00e9s 2026', 'taxe 26')).toBeTruthy()
expect(matchesSearchText('taxes 2026', 'tax receipt')).toBeFalsy()
})
})
-23
View File
@@ -1,23 +0,0 @@
import { normalizeSync } from 'normalize-diacritics'
export type SearchTextValue =
| string
| number
| boolean
| bigint
| null
| undefined
export function normalizeSearchText(value: SearchTextValue): string {
return normalizeSync(String(value ?? '')).toLocaleLowerCase()
}
export function matchesSearchText(
value: SearchTextValue,
searchText: SearchTextValue
): boolean {
const normalizedValue = normalizeSearchText(value)
const searchTerms = normalizeSearchText(searchText).trim().split(/\s+/)
return searchTerms.every((term) => normalizedValue.includes(term))
}
@@ -169,10 +169,6 @@ class FileStabilityTracker:
self._tracked.pop(path, None)
yield path
def is_tracking(self, path: Path) -> bool:
"""Check whether a path is currently being tracked for stability."""
return path.resolve() in self._tracked
def has_pending_files(self) -> bool:
"""Check if there are files waiting for stability check."""
return len(self._tracked) > 0
@@ -374,16 +370,6 @@ class Command(BaseCommand):
# Testing timeout in seconds
testing_timeout_s: Final[float] = 0.5
# How often to perform a full-glob rescan of the consume directory as a
# safety net. Each watchfiles watcher is torn down and recreated on every
# batch to reconfigure its timeout, and a fresh watcher silently adopts the
# current directory contents as its baseline. A file that appears between
# one batch and the next watcher's baseline is therefore never reported and
# would sit in the consume directory forever. This periodic rescan re-injects
# such files into the stability tracker (see GH issue #13011). Not currently
# user-configurable; instances may override for testing.
rescan_interval_s: float = 300.0
def add_arguments(self, parser) -> None:
parser.add_argument(
"directory",
@@ -439,7 +425,7 @@ class Command(BaseCommand):
)
# Process existing files
queued = self._process_existing_files(
self._process_existing_files(
directory=directory,
recursive=recursive,
subdirs_as_tags=subdirs_as_tags,
@@ -459,7 +445,6 @@ class Command(BaseCommand):
polling_interval=polling_interval,
stability_delay=stability_delay,
is_testing=is_testing,
queued=queued,
)
logger.debug("Consumer exiting")
@@ -471,18 +456,11 @@ class Command(BaseCommand):
recursive: bool,
subdirs_as_tags: bool,
consumer_filter: ConsumerFilter,
) -> set[Path]:
"""
Process any existing files in the consumption directory.
Returns the set of resolved paths that were queued, so the watch loop
can seed its in-flight set and avoid re-queuing them on the first
rescan before the consume tasks have removed them from disk.
"""
) -> None:
"""Process any existing files in the consumption directory."""
logger.info(f"Processing existing files in {directory}")
glob_pattern = "**/*" if recursive else "*"
queued: set[Path] = set()
for filepath in directory.glob(glob_pattern):
# Use filter to check if file should be processed
@@ -497,48 +475,6 @@ class Command(BaseCommand):
consumption_dir=directory,
subdirs_as_tags=subdirs_as_tags,
)
queued.add(filepath.resolve())
return queued
def _rescan_existing_files(
self,
*,
directory: Path,
recursive: bool,
consumer_filter: ConsumerFilter,
tracker: FileStabilityTracker,
queued: set[Path],
) -> None:
"""
Re-inject on-disk files the watcher never reported into the tracker.
Acts as a safety net for files stranded by the watcher-recreation gap
(see ``rescan_interval_s``). Files already being tracked or already
queued and awaiting consumption are skipped, so a file is never queued
twice. Queued paths that have since left the directory are pruned so a
later file reusing the same name is not skipped forever.
"""
# Prune in-flight paths that have left the directory
for path in list(queued):
if not path.exists():
queued.discard(path)
glob_pattern = "**/*" if recursive else "*"
for filepath in directory.glob(glob_pattern):
if not filepath.is_file():
continue
if not consumer_filter(Change.added, str(filepath)):
continue
resolved = filepath.resolve()
if tracker.is_tracking(resolved) or resolved in queued:
continue
logger.debug(f"Rescan found untracked file: {resolved}")
tracker.track(resolved, Change.added)
def _watch_directory(
self,
@@ -550,24 +486,11 @@ class Command(BaseCommand):
polling_interval: float,
stability_delay: float,
is_testing: bool,
queued: set[Path] | None = None,
) -> None:
"""Watch directory for changes and process stable files."""
use_polling = polling_interval > 0
poll_delay_ms = int(polling_interval * 1000) if use_polling else 0
# Resolved paths that have been queued and are awaiting consumption.
# Seeded from the startup scan so the first rescan does not re-queue
# files whose consume tasks have not yet removed them from disk.
queued = set() if queued is None else queued
# Full-glob safety net cadence (0 disables)
rescan_interval_s = self.rescan_interval_s
rescan_timeout_ms = (
int(rescan_interval_s * 1000) if rescan_interval_s > 0 else 0
)
last_rescan = monotonic()
if use_polling:
logger.info(
f"Watching {directory} using polling (interval: {polling_interval}s)",
@@ -582,20 +505,6 @@ class Command(BaseCommand):
stability_timeout_ms = int(stability_delay * 1000)
testing_timeout_ms = int(self.testing_timeout_s * 1000)
def cap_for_rescan(ms: int) -> int:
"""
Ensure the watch loop wakes often enough to run the rescan.
``watch()`` blocks for up to ``rust_timeout``, so the rescan can
only run that often. A timeout of 0 means "wait indefinitely",
which would never wake to rescan; cap it at the rescan interval.
"""
if rescan_timeout_ms <= 0:
return ms
if ms <= 0:
return rescan_timeout_ms
return min(ms, rescan_timeout_ms)
# Calculate appropriate timeout for watch loop
# In polling mode, rust_timeout must be significantly longer than poll_delay_ms
# to ensure poll cycles can complete before timing out
@@ -613,8 +522,6 @@ class Command(BaseCommand):
# Not testing, wait indefinitely for first event
timeout_ms = 0
timeout_ms = cap_for_rescan(timeout_ms)
self.stop_flag.clear()
while not self.stop_flag.is_set():
@@ -644,26 +551,10 @@ class Command(BaseCommand):
consumption_dir=directory,
subdirs_as_tags=subdirs_as_tags,
)
# Remember it so the rescan does not re-queue it while
# the consume task has yet to remove it from disk
queued.add(stable_path)
# Exit watch loop to reconfigure timeout
break
# Periodic full-glob safety net for files the watcher missed
if rescan_timeout_ms > 0 and (
monotonic() - last_rescan >= rescan_interval_s
):
self._rescan_existing_files(
directory=directory,
recursive=recursive,
consumer_filter=consumer_filter,
tracker=tracker,
queued=queued,
)
last_rescan = monotonic()
# Determine next timeout
if tracker.has_pending_files():
# Check pending files at stability interval
@@ -681,8 +572,6 @@ class Command(BaseCommand):
# No pending files, wait indefinitely
timeout_ms = 0
timeout_ms = cap_for_rescan(timeout_ms)
except KeyboardInterrupt: # pragma: nocover
logger.info("Received interrupt, stopping consumer")
self.stop_flag.set()
@@ -1,63 +0,0 @@
# Generated by Django 5.2.14 on 2026-06-04 15:31
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
replaces = [
("documents", "0003_remove_document_storage_type"),
("documents", "0004_workflowtrigger_filter_has_any_correspondents_and_more"),
("documents", "0005_alter_document_checksum_unique"),
]
dependencies = [
("documents", "0002_squashed"),
]
operations = [
migrations.RemoveField(
model_name="document",
name="storage_type",
),
migrations.AddField(
model_name="workflowtrigger",
name="filter_has_any_correspondents",
field=models.ManyToManyField(
blank=True,
related_name="workflowtriggers_has_any_correspondent",
to="documents.correspondent",
verbose_name="has one of these correspondents",
),
),
migrations.AddField(
model_name="workflowtrigger",
name="filter_has_any_document_types",
field=models.ManyToManyField(
blank=True,
related_name="workflowtriggers_has_any_document_type",
to="documents.documenttype",
verbose_name="has one of these document types",
),
),
migrations.AddField(
model_name="workflowtrigger",
name="filter_has_any_storage_paths",
field=models.ManyToManyField(
blank=True,
related_name="workflowtriggers_has_any_storage_path",
to="documents.storagepath",
verbose_name="has one of these storage paths",
),
),
migrations.AlterField(
model_name="document",
name="checksum",
field=models.CharField(
editable=False,
help_text="The checksum of the original document.",
max_length=32,
verbose_name="checksum",
),
),
]
@@ -1,252 +0,0 @@
# Generated by Django 5.2.14 on 2026-06-04 15:31
import django.db.models.deletion
import django.db.models.functions.text
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
replaces = [
("documents", "0008_workflowaction_passwords_alter_workflowaction_type"),
("documents", "0009_alter_document_content_length"),
("documents", "0010_optimize_integer_field_sizes"),
("documents", "0011_alter_workflowaction_type"),
("documents", "0012_document_root_document"),
]
dependencies = [
("documents", "0007_sharelinkbundle"),
]
operations = [
migrations.AddField(
model_name="workflowaction",
name="passwords",
field=models.JSONField(
blank=True,
help_text="Passwords to try when removing PDF protection. Separate with commas or new lines.",
null=True,
verbose_name="passwords",
),
),
migrations.AlterField(
model_name="document",
name="content_length",
field=models.GeneratedField(
db_persist=True,
expression=django.db.models.functions.text.Length("content"),
help_text="Length of the content field in characters. Automatically maintained by the database for faster statistics computation.",
output_field=models.PositiveIntegerField(default=0),
serialize=False,
),
),
migrations.AlterField(
model_name="correspondent",
name="matching_algorithm",
field=models.PositiveSmallIntegerField(
choices=[
(0, "None"),
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name="documenttype",
name="matching_algorithm",
field=models.PositiveSmallIntegerField(
choices=[
(0, "None"),
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveSmallIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
(18, "does not have ASN"),
(19, "title or content contains"),
(20, "fulltext query"),
(21, "more like this"),
(22, "has tags in"),
(23, "ASN greater than"),
(24, "ASN less than"),
(25, "storage path is"),
(26, "has correspondent in"),
(27, "does not have correspondent in"),
(28, "has document type in"),
(29, "does not have document type in"),
(30, "has storage path in"),
(31, "does not have storage path in"),
(32, "owner is"),
(33, "has owner in"),
(34, "does not have owner"),
(35, "does not have owner in"),
(36, "has custom field value"),
(37, "is shared by me"),
(38, "has custom fields"),
(39, "has custom field in"),
(40, "does not have custom field in"),
(41, "does not have custom field"),
(42, "custom fields query"),
(43, "created to"),
(44, "created from"),
(45, "added to"),
(46, "added from"),
(47, "mime type is"),
],
verbose_name="rule type",
),
),
migrations.AlterField(
model_name="storagepath",
name="matching_algorithm",
field=models.PositiveSmallIntegerField(
choices=[
(0, "None"),
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name="tag",
name="matching_algorithm",
field=models.PositiveSmallIntegerField(
choices=[
(0, "None"),
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name="workflowrun",
name="type",
field=models.PositiveSmallIntegerField(
choices=[
(1, "Consumption Started"),
(2, "Document Added"),
(3, "Document Updated"),
(4, "Scheduled"),
],
null=True,
verbose_name="workflow trigger type",
),
),
migrations.AlterField(
model_name="workflowtrigger",
name="matching_algorithm",
field=models.PositiveSmallIntegerField(
choices=[
(0, "None"),
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
],
default=0,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name="workflowtrigger",
name="type",
field=models.PositiveSmallIntegerField(
choices=[
(1, "Consumption Started"),
(2, "Document Added"),
(3, "Document Updated"),
(4, "Scheduled"),
],
default=1,
verbose_name="Workflow Trigger Type",
),
),
migrations.AlterField(
model_name="workflowaction",
name="type",
field=models.PositiveSmallIntegerField(
choices=[
(1, "Assignment"),
(2, "Removal"),
(3, "Email"),
(4, "Webhook"),
(5, "Password removal"),
(6, "Move to trash"),
],
default=1,
verbose_name="Workflow Action Type",
),
),
migrations.AddField(
model_name="document",
name="root_document",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="versions",
to="documents.document",
verbose_name="root document for this version",
),
),
migrations.AddField(
model_name="document",
name="version_label",
field=models.CharField(
blank=True,
help_text="Optional short label for a document version.",
max_length=64,
null=True,
verbose_name="version label",
),
),
]
+2 -18
View File
@@ -866,24 +866,8 @@ class TantivyBackend:
final_query = self._apply_permission_filter(mlt_query, user)
effective_limit = limit if limit is not None else searcher.num_docs
try:
# Fetch one extra to account for excluding the original document
results = searcher.search(final_query, limit=effective_limit + 1)
except BaseException: # pragma: no cover
# Tantivy 0.26 panics in BM25 idf scoring when the index holds
# soft-deleted documents (doc_freq can exceed the alive doc count),
# which only surfaces for the More Like This query. The panic crosses
# the pyo3 boundary as a `pyo3_runtime.PanicException` — a
# BaseException, not an Exception — so catch BaseException and degrade
# to "no similar documents" instead of bubbling a 500 to the client.
# Fixed upstream: https://github.com/quickwit-oss/tantivy/pull/2964
# Remove once the bundled tantivy includes that fix.
logger.warning(
"More Like This scoring panicked (likely stale tantivy segment "
"stats after deletions); returning no results. A search index "
"reindex will rebuild consistent statistics.",
)
return []
# Fetch one extra to account for excluding the original document
results = searcher.search(final_query, limit=effective_limit + 1)
addrs = [addr for _score, addr in results.hits]
all_ids = cast("list[int]", searcher.fast_field_values("id", addrs))
-43
View File
@@ -1,7 +1,6 @@
from __future__ import annotations
import logging
from datetime import UTC
from typing import TYPE_CHECKING
from typing import Final
@@ -9,12 +8,6 @@ import regex
import tantivy
from django.conf import settings
from documents.search._dates import (
_date_only_range, # noqa: F401 — re-exported for test imports
)
from documents.search._dates import (
_datetime_range, # noqa: F401 — re-exported for test imports
)
from documents.search._tokenizer import simple_search_tokens
from documents.search._translate import SearchQueryError
from documents.search._translate import translate_query
@@ -64,42 +57,6 @@ def _build_cjk_query(
return None
def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
"""
Rewrite natural date syntax to ISO 8601 format for Tantivy compatibility.
Delegates to ``translate_query`` which handles all date forms, comma
expansion, field aliasing, relative ranges, and operator normalization.
Args:
query: Raw user query string
tz: Timezone for converting local date boundaries to UTC
Returns:
Query with date syntax rewritten to ISO 8601 ranges
Note:
Bare keywords without field prefixes pass through unchanged.
"""
return translate_query(query, tz)
def normalize_query(query: str) -> str:
"""
Normalize query syntax for better search behavior.
Delegates to ``translate_query`` which handles comma expansion, whitespace
collapsing, operator normalization, and field aliasing.
Args:
query: Query string after date rewriting
Returns:
Normalized query string ready for Tantivy parsing
"""
return translate_query(query, UTC)
def build_permission_filter(
schema: tantivy.Schema,
user: AbstractBaseUser,
+49 -48
View File
@@ -11,16 +11,15 @@ import pytest
import tantivy
import time_machine
from documents.search._query import _date_only_range
from documents.search._query import _datetime_range
from documents.search._dates import _date_only_range
from documents.search._dates import _datetime_range
from documents.search._query import build_permission_filter
from documents.search._query import normalize_query
from documents.search._query import parse_simple_text_highlight_query
from documents.search._query import parse_user_query
from documents.search._query import rewrite_natural_date_keywords
from documents.search._schema import build_schema
from documents.search._tokenizer import register_tokenizers
from documents.search._translate import InvalidDateQuery
from documents.search._translate import translate_query
if TYPE_CHECKING:
from django.contrib.auth.base_user import AbstractBaseUser
@@ -57,7 +56,7 @@ class TestCreatedDateField:
)
@time_machine.travel(datetime(2026, 3, 28, 15, 30, tzinfo=UTC), tick=False)
def test_today(self, tz: tzinfo, expected_lo: str, expected_hi: str) -> None:
lo, hi = _range(rewrite_natural_date_keywords("created:today", tz), "created")
lo, hi = _range(translate_query("created:today", tz), "created")
assert lo == expected_lo
assert hi == expected_hi
@@ -65,7 +64,7 @@ class TestCreatedDateField:
def test_today_auckland_ahead_of_utc(self) -> None:
# UTC 03:00 -> Auckland (UTC+13) = 16:00 same date; local date = 2026-03-28
lo, _ = _range(
rewrite_natural_date_keywords("created:today", AUCKLAND),
translate_query("created:today", AUCKLAND),
"created",
)
assert lo == "2026-03-28T00:00:00Z"
@@ -127,7 +126,7 @@ class TestCreatedDateField:
) -> None:
# 2026-03-28 is Saturday; Mon-Sun week calculation built into expectations
query = f"{field}:{keyword}"
lo, hi = _range(rewrite_natural_date_keywords(query, UTC), field)
lo, hi = _range(translate_query(query, UTC), field)
assert lo == expected_lo
assert hi == expected_hi
@@ -135,7 +134,7 @@ class TestCreatedDateField:
def test_this_month_december_wraps_to_next_year(self) -> None:
# December: next month must roll over to January 1 of next year
lo, hi = _range(
rewrite_natural_date_keywords("created:this month", UTC),
translate_query("created:this month", UTC),
"created",
)
assert lo == "2026-12-01T00:00:00Z"
@@ -145,7 +144,7 @@ class TestCreatedDateField:
def test_last_month_january_wraps_to_previous_year(self) -> None:
# January: last month must roll back to December 1 of previous year
lo, hi = _range(
rewrite_natural_date_keywords("created:previous month", UTC),
translate_query("created:previous month", UTC),
"created",
)
assert lo == "2025-12-01T00:00:00Z"
@@ -154,7 +153,7 @@ class TestCreatedDateField:
@time_machine.travel(datetime(2026, 7, 15, 12, 0, tzinfo=UTC), tick=False)
def test_previous_quarter(self) -> None:
lo, hi = _range(
rewrite_natural_date_keywords('created:"previous quarter"', UTC),
translate_query('created:"previous quarter"', UTC),
"created",
)
assert lo == "2026-04-01T00:00:00Z"
@@ -174,7 +173,7 @@ class TestDateTimeFields:
@time_machine.travel(datetime(2026, 3, 28, 15, 30, tzinfo=UTC), tick=False)
def test_added_today_eastern(self) -> None:
# EDT = UTC-4; local midnight 2026-03-28 00:00 EDT = 2026-03-28 04:00 UTC
lo, hi = _range(rewrite_natural_date_keywords("added:today", EASTERN), "added")
lo, hi = _range(translate_query("added:today", EASTERN), "added")
assert lo == "2026-03-28T04:00:00Z"
assert hi == "2026-03-29T04:00:00Z"
@@ -182,14 +181,14 @@ class TestDateTimeFields:
def test_added_today_auckland_midnight_crossing(self) -> None:
# UTC 02:00 on 2026-03-29 -> Auckland (UTC+13) = 2026-03-29 15:00 local
# Auckland midnight = UTC 2026-03-28 11:00
lo, hi = _range(rewrite_natural_date_keywords("added:today", AUCKLAND), "added")
lo, hi = _range(translate_query("added:today", AUCKLAND), "added")
assert lo == "2026-03-28T11:00:00Z"
assert hi == "2026-03-29T11:00:00Z"
@time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False)
def test_modified_today_utc(self) -> None:
lo, hi = _range(
rewrite_natural_date_keywords("modified:today", UTC),
translate_query("modified:today", UTC),
"modified",
)
assert lo == "2026-03-28T00:00:00Z"
@@ -244,14 +243,14 @@ class TestDateTimeFields:
expected_hi: str,
) -> None:
# 2026-03-28 is Saturday; weekday()==5 so Monday=2026-03-23
lo, hi = _range(rewrite_natural_date_keywords(f"added:{keyword}", UTC), "added")
lo, hi = _range(translate_query(f"added:{keyword}", UTC), "added")
assert lo == expected_lo
assert hi == expected_hi
@time_machine.travel(datetime(2026, 12, 15, 12, 0, tzinfo=UTC), tick=False)
def test_this_month_december_wraps_to_next_year(self) -> None:
# December: next month wraps to January of next year
lo, hi = _range(rewrite_natural_date_keywords("added:this month", UTC), "added")
lo, hi = _range(translate_query("added:this month", UTC), "added")
assert lo == "2026-12-01T00:00:00Z"
assert hi == "2027-01-01T00:00:00Z"
@@ -259,7 +258,7 @@ class TestDateTimeFields:
def test_last_month_january_wraps_to_previous_year(self) -> None:
# January: last month wraps back to December of previous year
lo, hi = _range(
rewrite_natural_date_keywords("added:previous month", UTC),
translate_query("added:previous month", UTC),
"added",
)
assert lo == "2025-12-01T00:00:00Z"
@@ -295,7 +294,7 @@ class TestDateTimeFields:
expected_lo: str,
expected_hi: str,
) -> None:
lo, hi = _range(rewrite_natural_date_keywords(query, UTC), "added")
lo, hi = _range(translate_query(query, UTC), "added")
assert lo == expected_lo
assert hi == expected_hi
@@ -309,20 +308,20 @@ class TestWhooshQueryRewriting:
@time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False)
def test_compact_date_shim_rewrites_to_iso(self) -> None:
result = rewrite_natural_date_keywords("created:20240115120000", UTC)
result = translate_query("created:20240115120000", UTC)
assert "2024-01-15" in result
assert "20240115120000" not in result
@time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False)
def test_relative_range_shim_removes_now(self) -> None:
result = rewrite_natural_date_keywords("added:[now-7d TO now]", UTC)
result = translate_query("added:[now-7d TO now]", UTC)
assert "now" not in result
assert "2026-03-" in result
@time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False)
def test_bracket_minus_7_days(self) -> None:
lo, hi = _range(
rewrite_natural_date_keywords("added:[-7 days to now]", UTC),
translate_query("added:[-7 days to now]", UTC),
"added",
)
assert lo == "2026-03-21T12:00:00Z"
@@ -331,7 +330,7 @@ class TestWhooshQueryRewriting:
@time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False)
def test_bracket_minus_1_week(self) -> None:
lo, hi = _range(
rewrite_natural_date_keywords("added:[-1 week to now]", UTC),
translate_query("added:[-1 week to now]", UTC),
"added",
)
assert lo == "2026-03-21T12:00:00Z"
@@ -341,7 +340,7 @@ class TestWhooshQueryRewriting:
def test_bracket_minus_1_month_uses_relativedelta(self) -> None:
# relativedelta(months=1) from 2026-03-28 = 2026-02-28 (not 29)
lo, hi = _range(
rewrite_natural_date_keywords("created:[-1 month to now]", UTC),
translate_query("created:[-1 month to now]", UTC),
"created",
)
assert lo == "2026-02-28T12:00:00Z"
@@ -350,7 +349,7 @@ class TestWhooshQueryRewriting:
@time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False)
def test_bracket_minus_1_year(self) -> None:
lo, hi = _range(
rewrite_natural_date_keywords("modified:[-1 year to now]", UTC),
translate_query("modified:[-1 year to now]", UTC),
"modified",
)
assert lo == "2025-03-28T12:00:00Z"
@@ -359,7 +358,7 @@ class TestWhooshQueryRewriting:
@time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False)
def test_bracket_plural_unit_hours(self) -> None:
lo, hi = _range(
rewrite_natural_date_keywords("added:[-3 hours to now]", UTC),
translate_query("added:[-3 hours to now]", UTC),
"added",
)
assert lo == "2026-03-28T09:00:00Z"
@@ -367,7 +366,7 @@ class TestWhooshQueryRewriting:
@time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False)
def test_bracket_case_insensitive(self) -> None:
result = rewrite_natural_date_keywords("added:[-1 WEEK TO NOW]", UTC)
result = translate_query("added:[-1 WEEK TO NOW]", UTC)
assert "now" not in result.lower()
lo, hi = _range(result, "added")
assert lo == "2026-03-21T12:00:00Z"
@@ -377,7 +376,7 @@ class TestWhooshQueryRewriting:
def test_relative_range_swaps_bounds_when_lo_exceeds_hi(self) -> None:
# [now+1h TO now-1h] has lo > hi before substitution; they must be swapped
lo, hi = _range(
rewrite_natural_date_keywords("added:[now+1h TO now-1h]", UTC),
translate_query("added:[now+1h TO now-1h]", UTC),
"added",
)
assert lo == "2026-03-28T11:00:00Z"
@@ -385,14 +384,14 @@ class TestWhooshQueryRewriting:
def test_8digit_created_date_field_always_uses_utc_midnight(self) -> None:
# created is a DateField: boundaries are always UTC midnight, no TZ offset
result = rewrite_natural_date_keywords("created:20231201", EASTERN)
result = translate_query("created:20231201", EASTERN)
lo, hi = _range(result, "created")
assert lo == "2023-12-01T00:00:00Z"
assert hi == "2023-12-02T00:00:00Z"
def test_8digit_added_datetime_field_converts_local_midnight_to_utc(self) -> None:
# added is DateTimeField: midnight Dec 1 Eastern (EST = UTC-5) = 05:00 UTC
result = rewrite_natural_date_keywords("added:20231201", EASTERN)
result = translate_query("added:20231201", EASTERN)
lo, hi = _range(result, "added")
assert lo == "2023-12-01T05:00:00Z"
assert hi == "2023-12-02T05:00:00Z"
@@ -400,7 +399,7 @@ class TestWhooshQueryRewriting:
def test_8digit_modified_datetime_field_converts_local_midnight_to_utc(
self,
) -> None:
result = rewrite_natural_date_keywords("modified:20231201", EASTERN)
result = translate_query("modified:20231201", EASTERN)
lo, hi = _range(result, "modified")
assert lo == "2023-12-01T05:00:00Z"
assert hi == "2023-12-02T05:00:00Z"
@@ -410,7 +409,7 @@ class TestWhooshQueryRewriting:
# (e.g. month=13) so the API can surface a 400 telling the user the date
# is malformed instead of silently returning zero results.
with pytest.raises(InvalidDateQuery) as exc_info:
rewrite_natural_date_keywords("added:20231340", UTC)
translate_query("added:20231340", UTC)
assert exc_info.value.field == "added"
assert exc_info.value.value == "20231340"
@@ -577,7 +576,7 @@ class TestYearRangeRewriting:
expected_lo: str,
expected_hi: str,
) -> None:
result = rewrite_natural_date_keywords(query, UTC)
result = translate_query(query, UTC)
lo, hi = _range(result, field)
assert lo == expected_lo
assert hi == expected_hi
@@ -585,14 +584,14 @@ class TestYearRangeRewriting:
def test_reversed_year_range_is_swapped(self) -> None:
# A reversed range must not yield lo > hi, which Tantivy treats as an
# empty range (silently zero results). The bounds are swapped instead.
result = rewrite_natural_date_keywords("created:[2025 TO 2020]", UTC)
result = translate_query("created:[2025 TO 2020]", UTC)
lo, hi = _range(result, "created")
assert lo == "2020-01-01T00:00:00Z"
assert hi == "2026-01-01T00:00:00Z"
def test_year_range_in_complex_boolean_query(self) -> None:
query = "tag:steuer AND (title:2020 OR (NOT title:2019 AND NOT title:2018 AND created:[2020 TO 2020]))"
result = rewrite_natural_date_keywords(query, UTC)
result = translate_query(query, UTC)
lo, hi = _range(result, "created")
assert lo == "2020-01-01T00:00:00Z"
assert hi == "2021-01-01T00:00:00Z"
@@ -602,7 +601,7 @@ class TestYearRangeRewriting:
def test_already_iso_date_range_passes_through_unchanged(self) -> None:
original = "created:[2020-01-01T00:00:00Z TO 2021-01-01T00:00:00Z]"
assert rewrite_natural_date_keywords(original, UTC) == original
assert translate_query(original, UTC) == original
def test_8digit_in_brackets_not_matched_as_year_range(self) -> None:
# [YYYYMMDD TO YYYYMMDD]: the translation layer converts 8-digit bounds to
@@ -611,7 +610,7 @@ class TestYearRangeRewriting:
# This is the correct and accepted behavior: old compact form becomes a
# proper Tantivy-parseable ISO range.
original = "created:[20200101 TO 20201231]"
result = rewrite_natural_date_keywords(original, UTC)
result = translate_query(original, UTC)
lo, hi = _range(result, "created")
assert lo == "2020-01-01T00:00:00Z"
assert hi == "2021-01-01T00:00:00Z"
@@ -634,7 +633,7 @@ class TestNonDateFieldsNotRewritten:
],
)
def test_8digit_on_integer_field_passes_through_unchanged(self, query: str) -> None:
assert rewrite_natural_date_keywords(query, EASTERN) == query
assert translate_query(query, EASTERN) == query
@pytest.mark.parametrize(
"query",
@@ -648,12 +647,12 @@ class TestNonDateFieldsNotRewritten:
self,
query: str,
) -> None:
assert rewrite_natural_date_keywords(query, UTC) == query
assert translate_query(query, UTC) == query
def test_unknown_field_keyword_passes_through_unchanged(self) -> None:
# foobar is not a date field: 'foobar:today' must not become a date range,
# which Tantivy would otherwise reject as an unknown/typed field.
assert rewrite_natural_date_keywords("foobar:today", UTC) == "foobar:today"
assert translate_query("foobar:today", UTC) == "foobar:today"
class TestPassthrough:
@@ -661,37 +660,39 @@ class TestPassthrough:
def test_bare_keyword_no_field_prefix_unchanged(self) -> None:
# Bare 'today' with no field: prefix passes through unchanged
result = rewrite_natural_date_keywords("bank statement today", UTC)
result = translate_query("bank statement today", UTC)
assert "today" in result
def test_unrelated_query_unchanged(self) -> None:
assert rewrite_natural_date_keywords("title:invoice", UTC) == "title:invoice"
assert translate_query("title:invoice", UTC) == "title:invoice"
class TestNormalizeQuery:
"""normalize_query expands comma-separated values and collapses whitespace."""
"""translate_query expands comma-separated values and collapses whitespace."""
def test_normalize_expands_comma_separated_tags(self) -> None:
assert normalize_query("tag:foo,bar") == "tag:foo AND tag:bar"
assert translate_query("tag:foo,bar", UTC) == "tag:foo AND tag:bar"
def test_normalize_comma_between_range_expressions(self) -> None:
# Comma-separated field range expressions (Whoosh v2 syntax) must be
# converted to AND so Tantivy does not receive an invalid comma.
q = "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z],added:[2026-05-01T00:00:00Z TO 2026-06-01T00:00:00Z]"
assert normalize_query(q) == (
assert translate_query(q, UTC) == (
"created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z]"
" AND "
"added:[2026-05-01T00:00:00Z TO 2026-06-01T00:00:00Z]"
)
def test_normalize_expands_three_values(self) -> None:
assert normalize_query("tag:foo,bar,baz") == "tag:foo AND tag:bar AND tag:baz"
assert (
translate_query("tag:foo,bar,baz", UTC) == "tag:foo AND tag:bar AND tag:baz"
)
def test_normalize_collapses_whitespace(self) -> None:
assert normalize_query("bank statement") == "bank statement"
assert translate_query("bank statement", UTC) == "bank statement"
def test_normalize_no_commas_unchanged(self) -> None:
assert normalize_query("bank statement") == "bank statement"
assert translate_query("bank statement", UTC) == "bank statement"
@pytest.mark.parametrize(
("raw", "expected"),
@@ -734,7 +735,7 @@ class TestNormalizeQuery:
],
)
def test_normalize_strips_dangling_operators(self, raw: str, expected: str) -> None:
assert normalize_query(raw) == expected
assert translate_query(raw, UTC) == expected
@pytest.mark.parametrize(
"query",
@@ -746,7 +747,7 @@ class TestNormalizeQuery:
],
)
def test_normalize_preserves_valid_operators(self, query: str) -> None:
assert normalize_query(query) == query
assert translate_query(query, UTC) == query
class TestParseSimpleTextHighlightQuery:
@@ -82,7 +82,6 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
"llm_api_key": None,
"llm_endpoint": None,
"llm_output_language": None,
"llm_request_timeout": None,
},
)
@@ -684,7 +684,6 @@ class ConsumerThread(Thread):
subdirs_as_tags: bool = False,
polling_interval: float = 0,
stability_delay: float = 0.1,
rescan_interval: float | None = None,
) -> None:
super().__init__()
self.consumption_dir = consumption_dir
@@ -694,8 +693,6 @@ class ConsumerThread(Thread):
self.polling_interval = polling_interval
self.stability_delay = stability_delay
self.cmd = Command()
if rescan_interval is not None:
self.cmd.rescan_interval_s = rescan_interval
self.cmd.stop_flag.clear()
# Non-daemon ensures finally block runs and connections are closed
self.daemon = False
@@ -1055,200 +1052,3 @@ class TestCommandWatchEdgeCases:
thread.stop_and_wait(timeout=5.0)
# Clean up any Tags created by the thread
Tag.objects.all().delete()
class TestRescanExistingFiles:
"""
Unit tests for the rescan safety net.
Each ``watch()`` recreation silently adopts the current directory contents
as its baseline, so a file appearing between one batch and the next
watcher's baseline is never reported and would sit in the consume directory
forever. ``_rescan_existing_files`` re-injects such files into the
stability tracker as a periodic safety net (see GH issue #13011).
"""
@pytest.fixture
def pdf_only_filter(self) -> ConsumerFilter:
return ConsumerFilter(
supported_extensions=frozenset({".pdf"}),
ignore_patterns=[],
)
def _rescan(
self,
directory: Path,
consumer_filter: ConsumerFilter,
tracker: FileStabilityTracker,
queued: set[Path],
*,
recursive: bool = False,
) -> None:
Command()._rescan_existing_files(
directory=directory,
recursive=recursive,
consumer_filter=consumer_filter,
tracker=tracker,
queued=queued,
)
def test_tracks_stranded_file(
self,
consumption_dir: Path,
sample_pdf: Path,
pdf_only_filter: ConsumerFilter,
) -> None:
"""A supported on-disk file the watcher never reported gets tracked."""
target = consumption_dir / "stranded.pdf"
shutil.copy(sample_pdf, target)
tracker = FileStabilityTracker(stability_delay=0.1)
self._rescan(consumption_dir, pdf_only_filter, tracker, set())
assert tracker.is_tracking(target) is True
assert tracker.pending_count == 1
def test_skips_already_tracked_file(
self,
consumption_dir: Path,
sample_pdf: Path,
pdf_only_filter: ConsumerFilter,
) -> None:
"""A file already being tracked by the watcher is not double-tracked."""
target = consumption_dir / "tracked.pdf"
shutil.copy(sample_pdf, target)
tracker = FileStabilityTracker(stability_delay=0.1)
tracker.track(target, Change.added)
self._rescan(consumption_dir, pdf_only_filter, tracker, set())
assert tracker.pending_count == 1
def test_skips_queued_file(
self,
consumption_dir: Path,
sample_pdf: Path,
pdf_only_filter: ConsumerFilter,
) -> None:
"""A file already queued and awaiting consumption is not re-tracked."""
target = consumption_dir / "inflight.pdf"
shutil.copy(sample_pdf, target)
tracker = FileStabilityTracker(stability_delay=0.1)
queued = {target.resolve()}
self._rescan(consumption_dir, pdf_only_filter, tracker, queued)
assert tracker.pending_count == 0
def test_prunes_vanished_queued_paths(
self,
consumption_dir: Path,
pdf_only_filter: ConsumerFilter,
) -> None:
"""Queued paths no longer on disk are dropped so the name can recur."""
gone = (consumption_dir / "gone.pdf").resolve()
tracker = FileStabilityTracker(stability_delay=0.1)
queued = {gone}
self._rescan(consumption_dir, pdf_only_filter, tracker, queued)
assert gone not in queued
def test_skips_unsupported_extension(
self,
consumption_dir: Path,
pdf_only_filter: ConsumerFilter,
) -> None:
"""Files filtered out by the consumer filter are not tracked."""
(consumption_dir / "notes.xyz").write_bytes(b"content")
tracker = FileStabilityTracker(stability_delay=0.1)
self._rescan(consumption_dir, pdf_only_filter, tracker, set())
assert tracker.pending_count == 0
def test_recursive_respects_flag(
self,
consumption_dir: Path,
sample_pdf: Path,
pdf_only_filter: ConsumerFilter,
) -> None:
"""Nested files are only found when recursive scanning is enabled."""
subdir = consumption_dir / "nested"
subdir.mkdir()
target = subdir / "deep.pdf"
shutil.copy(sample_pdf, target)
shallow = FileStabilityTracker(stability_delay=0.1)
self._rescan(consumption_dir, pdf_only_filter, shallow, set())
assert shallow.pending_count == 0
deep = FileStabilityTracker(stability_delay=0.1)
self._rescan(consumption_dir, pdf_only_filter, deep, set(), recursive=True)
assert deep.is_tracking(target) is True
class TestProcessExistingFilesQueued:
"""Tests that startup processing reports which paths it queued."""
@pytest.mark.usefixtures("mock_supported_extensions")
def test_returns_queued_paths(
self,
consumption_dir: Path,
sample_pdf: Path,
mock_consume_file_delay: MagicMock,
settings: SettingsWrapper,
) -> None:
"""The set returned seeds the rescan's queued set, avoiding re-queue."""
target = consumption_dir / "document.pdf"
shutil.copy(sample_pdf, target)
settings.CONSUMER_IGNORE_PATTERNS = []
queued = Command()._process_existing_files(
directory=consumption_dir,
recursive=False,
subdirs_as_tags=False,
consumer_filter=ConsumerFilter(ignore_patterns=[]),
)
assert target.resolve() in queued
@pytest.mark.management
@pytest.mark.django_db
class TestCommandRescanRecovery:
"""End-to-end test that the rescan recovers files the watcher misses."""
def test_rescan_consumes_file_the_watcher_never_reports(
self,
consumption_dir: Path,
sample_pdf: Path,
mock_consume_file_delay: MagicMock,
start_consumer: Callable[..., ConsumerThread],
) -> None:
"""
Isolate the rescan path: a long polling interval guarantees the
watcher cannot report the file within the test window, so only the
periodic rescan can consume it.
"""
# poll interval far longer than the test window -> watcher stays silent
thread = start_consumer(
polling_interval=30.0,
stability_delay=0.1,
rescan_interval=0.5,
)
# created after startup, so _process_existing_files did not see it
target = consumption_dir / "stranded.pdf"
shutil.copy(sample_pdf, target)
wait_for_mock_call(mock_consume_file_delay.apply_async, timeout_s=5.0)
if thread.exception:
raise thread.exception
mock_consume_file_delay.apply_async.assert_called()
call_args = mock_consume_file_delay.apply_async.call_args.kwargs["kwargs"][
"input_doc"
]
assert call_args.original_file.name == "stranded.pdf"
-28
View File
@@ -30,7 +30,6 @@ from documents.signals.handlers import update_llm_suggestions_cache
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import read_streaming_response
from paperless.models import ApplicationConfiguration
from paperless_ai.exceptions import LLMTimeoutError
class TestViews(DirectoriesMixin, TestCase):
@@ -477,33 +476,6 @@ class TestAISuggestions(DirectoriesMixin, TestCase):
get_llm_suggestion_cache(self.document.pk, backend="openai-like"),
)
@patch("documents.views.get_ai_document_classification")
@override_settings(
AI_ENABLED=True,
LLM_BACKEND="openai-like",
)
def test_ai_suggestions_with_llm_timeout(
self,
mock_get_ai_classification,
) -> None:
mock_get_ai_classification.side_effect = LLMTimeoutError()
self.client.force_login(user=self.user)
response = self.client.get(
f"/api/documents/{self.document.pk}/ai_suggestions/",
)
self.assertEqual(response.status_code, status.HTTP_503_SERVICE_UNAVAILABLE)
self.assertEqual(
response.json(),
{
"ai": ["AI backend request timed out."],
},
)
self.assertIsNone(
get_llm_suggestion_cache(self.document.pk, backend="openai-like"),
)
def test_invalidate_suggestions_cache(self) -> None:
self.client.force_login(user=self.user)
suggestions = {
-12
View File
@@ -241,7 +241,6 @@ from paperless.serialisers import UserSerializer
from paperless.views import StandardPagination
from paperless_ai.ai_classifier import get_ai_document_classification
from paperless_ai.chat import stream_chat_with_documents
from paperless_ai.exceptions import LLMTimeoutError
from paperless_ai.matching import extract_unmatched_names
from paperless_ai.matching import match_correspondents_by_name
from paperless_ai.matching import match_document_types_by_name
@@ -1511,17 +1510,6 @@ class DocumentViewSet(
exc_info=True,
)
raise ValidationError({"ai": [_("Invalid AI configuration.")]}) from exc
except LLMTimeoutError as exc:
logger.exception(
"AI backend timed out while generating suggestions for document %s: %s",
doc.pk,
exc,
exc_info=True,
)
return Response(
{"ai": [_("AI backend request timed out.")]},
status=status.HTTP_503_SERVICE_UNAVAILABLE,
)
matched_tags = match_tags_by_name(
llm_suggestions.get("tags", []),
-4
View File
@@ -197,7 +197,6 @@ class AIConfig(BaseConfig):
llm_embedding_endpoint: str = dataclasses.field(init=False)
llm_embedding_chunk_size: int = dataclasses.field(init=False)
llm_context_size: int = dataclasses.field(init=False)
llm_request_timeout: int = dataclasses.field(init=False)
llm_backend: str = dataclasses.field(init=False)
llm_model: str = dataclasses.field(init=False)
llm_api_key: str = dataclasses.field(init=False)
@@ -222,9 +221,6 @@ class AIConfig(BaseConfig):
app_config.llm_embedding_chunk_size or settings.LLM_EMBEDDING_CHUNK_SIZE
)
self.llm_context_size = app_config.llm_context_size or settings.LLM_CONTEXT_SIZE
self.llm_request_timeout = (
app_config.llm_request_timeout or settings.LLM_REQUEST_TIMEOUT
)
self.llm_backend = app_config.llm_backend or settings.LLM_BACKEND
self.llm_model = app_config.llm_model or settings.LLM_MODEL
self.llm_api_key = app_config.llm_api_key or settings.LLM_API_KEY
@@ -1,365 +0,0 @@
# Generated by Django 5.2.14 on 2026-06-04 15:30
import django.core.validators
from django.db import migrations
from django.db import models
def _create_singleton(apps, schema_editor):
settings_model = apps.get_model("paperless", "ApplicationConfiguration")
settings_model.objects.create()
class Migration(migrations.Migration):
replaces = [
("paperless", "0001_initial"),
("paperless", "0002_applicationconfiguration_app_logo_and_more"),
("paperless", "0003_alter_applicationconfiguration_max_image_pixels"),
("paperless", "0004_applicationconfiguration_barcode_asn_prefix_and_more"),
("paperless", "0005_applicationconfiguration_ai_enabled_and_more"),
("paperless", "0006_applicationconfiguration_barcode_tag_split"),
]
dependencies = []
operations = [
migrations.CreateModel(
name="ApplicationConfiguration",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"output_type",
models.CharField(
blank=True,
choices=[
("pdf", "pdf"),
("pdfa", "pdfa"),
("pdfa-1", "pdfa-1"),
("pdfa-2", "pdfa-2"),
("pdfa-3", "pdfa-3"),
],
max_length=8,
null=True,
verbose_name="Sets the output PDF type",
),
),
(
"pages",
models.PositiveIntegerField(
null=True,
validators=[django.core.validators.MinValueValidator(1)],
verbose_name="Do OCR from page 1 to this value",
),
),
(
"language",
models.CharField(
blank=True,
max_length=32,
null=True,
verbose_name="Do OCR using these languages",
),
),
(
"mode",
models.CharField(
blank=True,
choices=[
("skip", "skip"),
("redo", "redo"),
("force", "force"),
("skip_noarchive", "skip_noarchive"),
],
max_length=16,
null=True,
verbose_name="Sets the OCR mode",
),
),
(
"skip_archive_file",
models.CharField(
blank=True,
choices=[
("never", "never"),
("with_text", "with_text"),
("always", "always"),
],
max_length=16,
null=True,
verbose_name="Controls the generation of an archive file",
),
),
(
"image_dpi",
models.PositiveIntegerField(
null=True,
validators=[django.core.validators.MinValueValidator(1)],
verbose_name="Sets image DPI fallback value",
),
),
(
"unpaper_clean",
models.CharField(
blank=True,
choices=[
("clean", "clean"),
("clean-final", "clean-final"),
("none", "none"),
],
max_length=16,
null=True,
verbose_name="Controls the unpaper cleaning",
),
),
(
"deskew",
models.BooleanField(null=True, verbose_name="Enables deskew"),
),
(
"rotate_pages",
models.BooleanField(
null=True,
verbose_name="Enables page rotation",
),
),
(
"rotate_pages_threshold",
models.FloatField(
null=True,
validators=[django.core.validators.MinValueValidator(0.0)],
verbose_name="Sets the threshold for rotation of pages",
),
),
(
"max_image_pixels",
models.FloatField(
null=True,
validators=[django.core.validators.MinValueValidator(0.0)],
verbose_name="Sets the maximum image size for decompression",
),
),
(
"color_conversion_strategy",
models.CharField(
blank=True,
choices=[
("LeaveColorUnchanged", "LeaveColorUnchanged"),
("RGB", "RGB"),
("UseDeviceIndependentColor", "UseDeviceIndependentColor"),
("Gray", "Gray"),
("CMYK", "CMYK"),
],
max_length=32,
null=True,
verbose_name="Sets the Ghostscript color conversion strategy",
),
),
(
"user_args",
models.JSONField(
null=True,
verbose_name="Adds additional user arguments for OCRMyPDF",
),
),
(
"app_logo",
models.FileField(
blank=True,
null=True,
upload_to="logo/",
validators=[
django.core.validators.FileExtensionValidator(
allowed_extensions=["jpg", "png", "gif", "svg"],
),
],
verbose_name="Application logo",
),
),
(
"app_title",
models.CharField(
blank=True,
max_length=48,
null=True,
verbose_name="Application title",
),
),
(
"barcode_asn_prefix",
models.CharField(
blank=True,
max_length=32,
null=True,
verbose_name="Sets the ASN barcode prefix",
),
),
(
"barcode_dpi",
models.PositiveIntegerField(
null=True,
validators=[django.core.validators.MinValueValidator(1)],
verbose_name="Sets the barcode DPI",
),
),
(
"barcode_enable_asn",
models.BooleanField(
null=True,
verbose_name="Enables ASN barcode",
),
),
(
"barcode_enable_tag",
models.BooleanField(
null=True,
verbose_name="Enables tag barcode",
),
),
(
"barcode_enable_tiff_support",
models.BooleanField(
null=True,
verbose_name="Enables barcode TIFF support",
),
),
(
"barcode_max_pages",
models.PositiveIntegerField(
null=True,
validators=[django.core.validators.MinValueValidator(1)],
verbose_name="Sets the maximum pages for barcode",
),
),
(
"barcode_retain_split_pages",
models.BooleanField(
null=True,
verbose_name="Retains split pages",
),
),
(
"barcode_string",
models.CharField(
blank=True,
max_length=32,
null=True,
verbose_name="Sets the barcode string",
),
),
(
"barcode_tag_mapping",
models.JSONField(
null=True,
verbose_name="Sets the tag barcode mapping",
),
),
(
"barcode_upscale",
models.FloatField(
null=True,
validators=[django.core.validators.MinValueValidator(1.0)],
verbose_name="Sets the barcode upscale factor",
),
),
(
"barcodes_enabled",
models.BooleanField(
null=True,
verbose_name="Enables barcode scanning",
),
),
(
"ai_enabled",
models.BooleanField(
default=False,
null=True,
verbose_name="Enables AI features",
),
),
(
"llm_api_key",
models.CharField(
blank=True,
max_length=1024,
null=True,
verbose_name="Sets the LLM API key",
),
),
(
"llm_backend",
models.CharField(
blank=True,
choices=[
("openai-like", "OpenAI-compatible"),
("ollama", "Ollama"),
],
max_length=128,
null=True,
verbose_name="Sets the LLM backend",
),
),
(
"llm_embedding_backend",
models.CharField(
blank=True,
choices=[
("openai-like", "OpenAI-compatible"),
("huggingface", "Huggingface"),
],
max_length=128,
null=True,
verbose_name="Sets the LLM embedding backend",
),
),
(
"llm_embedding_model",
models.CharField(
blank=True,
max_length=128,
null=True,
verbose_name="Sets the LLM embedding model",
),
),
(
"llm_endpoint",
models.CharField(
blank=True,
max_length=256,
null=True,
verbose_name="Sets the LLM endpoint, optional",
),
),
(
"llm_model",
models.CharField(
blank=True,
max_length=128,
null=True,
verbose_name="Sets the LLM model",
),
),
(
"barcode_tag_split",
models.BooleanField(
null=True,
verbose_name="Enables splitting on tag barcodes",
),
),
],
options={
"verbose_name": "paperless application settings",
},
),
migrations.RunPython(
code=_create_singleton,
reverse_code=migrations.RunPython.noop,
),
]
@@ -1,94 +0,0 @@
# Generated by Django 5.2.14 on 2026-06-04 15:19
import django.core.validators
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
replaces = [
("paperless", "0009_alter_applicationconfiguration_options"),
("paperless", "0010_alter_applicationconfiguration_llm_embedding_backend"),
("paperless", "0011_applicationconfiguration_llm_embedding_chunk_size"),
("paperless", "0012_applicationconfiguration_llm_output_language"),
("paperless", "0013_applicationconfiguration_llm_request_timeout"),
]
dependencies = [
("paperless", "0008_replace_skip_archive_file"),
]
operations = [
migrations.AlterModelOptions(
name="applicationconfiguration",
options={
"permissions": [
("view_global_statistics", "Can view global object counts"),
("view_system_monitoring", "Can view system status information"),
],
"verbose_name": "paperless application settings",
},
),
migrations.AlterField(
model_name="applicationconfiguration",
name="llm_embedding_backend",
field=models.CharField(
blank=True,
choices=[
("openai-like", "OpenAI-compatible"),
("huggingface", "Huggingface"),
("ollama", "Ollama"),
],
max_length=128,
null=True,
verbose_name="Sets the LLM embedding backend",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_embedding_endpoint",
field=models.CharField(
blank=True,
max_length=256,
null=True,
verbose_name="Sets the LLM embedding endpoint, optional",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_embedding_chunk_size",
field=models.PositiveSmallIntegerField(
null=True,
validators=[django.core.validators.MinValueValidator(1)],
verbose_name="Sets the LLM embedding chunk size",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_context_size",
field=models.PositiveIntegerField(
null=True,
validators=[django.core.validators.MinValueValidator(1)],
verbose_name="Sets the LLM context size",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_output_language",
field=models.CharField(
blank=True,
max_length=32,
null=True,
verbose_name="Sets the LLM output language",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_request_timeout",
field=models.PositiveSmallIntegerField(
null=True,
validators=[django.core.validators.MinValueValidator(1)],
verbose_name="Sets the LLM request timeout in seconds",
),
),
]
@@ -1,23 +0,0 @@
# Generated by Django 5.2.14 on 2026-06-14 14:22
import django.core.validators
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("paperless", "0012_applicationconfiguration_llm_output_language"),
]
operations = [
migrations.AddField(
model_name="applicationconfiguration",
name="llm_request_timeout",
field=models.PositiveSmallIntegerField(
null=True,
validators=[django.core.validators.MinValueValidator(1)],
verbose_name="Sets the LLM request timeout in seconds",
),
),
]
-6
View File
@@ -366,12 +366,6 @@ class ApplicationConfiguration(AbstractSingletonModel):
max_length=32,
)
llm_request_timeout = models.PositiveSmallIntegerField(
verbose_name=_("Sets the LLM timeout in seconds"),
null=True,
validators=[MinValueValidator(1)],
)
class Meta:
verbose_name = _("paperless application settings")
permissions = [
-3
View File
@@ -1206,9 +1206,6 @@ if LLM_EMBEDDING_CHUNK_SIZE < 1:
LLM_CONTEXT_SIZE = get_int_from_env("PAPERLESS_AI_LLM_CONTEXT_SIZE", 8192)
if LLM_CONTEXT_SIZE < 1:
raise ImproperlyConfigured("PAPERLESS_AI_LLM_CONTEXT_SIZE must be >= 1")
LLM_REQUEST_TIMEOUT = get_int_from_env("PAPERLESS_AI_LLM_REQUEST_TIMEOUT", 120)
if LLM_REQUEST_TIMEOUT < 1:
raise ImproperlyConfigured("PAPERLESS_AI_LLM_REQUEST_TIMEOUT must be >= 1")
LLM_BACKEND = get_choice_from_env(
"PAPERLESS_AI_LLM_BACKEND",
{"ollama", "openai-like"},
+28 -49
View File
@@ -1,14 +1,11 @@
import json
import logging
from collections.abc import Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING
import httpx
from paperless.models import LLMBackend
if TYPE_CHECKING:
from llama_index.core.llms import ChatMessage
from llama_index.llms.ollama import Ollama
from llama_index.llms.openai_like import OpenAILike
@@ -19,7 +16,6 @@ from paperless.network import create_pinned_async_httpx_client
from paperless.network import create_pinned_httpx_client
from paperless.network import validate_outbound_http_url
from paperless_ai.base_model import DocumentClassifierSchema
from paperless_ai.exceptions import LLMTimeoutError
logger = logging.getLogger("paperless_ai.client")
@@ -65,16 +61,16 @@ class AIClient:
model=self.settings.llm_model or "llama3.1",
base_url=endpoint,
context_window=self.settings.llm_context_size,
request_timeout=self.settings.llm_request_timeout,
request_timeout=120,
system_prompt=LLM_SYSTEM_PROMPT,
client=Client(
host=endpoint,
timeout=self.settings.llm_request_timeout,
timeout=120,
transport=transport,
),
async_client=AsyncClient(
host=endpoint,
timeout=self.settings.llm_request_timeout,
timeout=120,
transport=async_transport,
),
)
@@ -88,18 +84,15 @@ class AIClient:
http_client = create_pinned_httpx_client(
endpoint,
allow_internal=self.settings.llm_allow_internal_endpoints,
timeout=self.settings.llm_request_timeout,
)
async_http_client = create_pinned_async_httpx_client(
endpoint,
allow_internal=self.settings.llm_allow_internal_endpoints,
timeout=self.settings.llm_request_timeout,
)
return OpenAILike(
model=self.settings.llm_model or "gpt-3.5-turbo",
api_base=endpoint,
api_key=self.settings.llm_api_key,
timeout=self.settings.llm_request_timeout,
is_chat_model=True,
is_function_calling_model=True,
system_prompt=LLM_SYSTEM_PROMPT,
@@ -120,12 +113,11 @@ class AIClient:
user_msg = ChatMessage(role="user", content=prompt)
if self.settings.llm_backend == LLMBackend.OLLAMA:
with self._normalize_timeouts():
result = self.llm.chat(
[user_msg],
format=DocumentClassifierSchema.model_json_schema(),
think=False,
)
result = self.llm.chat(
[user_msg],
format=DocumentClassifierSchema.model_json_schema(),
think=False,
)
logger.debug("LLM query result: %s", result)
parsed = DocumentClassifierSchema(**json.loads(result.message.content))
return parsed.model_dump()
@@ -133,39 +125,26 @@ class AIClient:
from llama_index.core.program.function_program import get_function_tool
tool = get_function_tool(DocumentClassifierSchema)
with self._normalize_timeouts():
result = self.llm.chat_with_tools(
tools=[tool],
user_msg=user_msg,
chat_history=[],
allow_parallel_tool_calls=True,
tool_required=True,
)
tool_calls = self.llm.get_tool_calls_from_response(
result,
error_on_no_tool_call=True,
)
result = self.llm.chat_with_tools(
tools=[tool],
user_msg=user_msg,
chat_history=[],
allow_parallel_tool_calls=True,
)
tool_calls = self.llm.get_tool_calls_from_response(
result,
error_on_no_tool_call=True,
)
logger.debug("LLM query result: %s", tool_calls)
parsed = DocumentClassifierSchema(**tool_calls[0].tool_kwargs)
return parsed.model_dump()
@contextmanager
def _normalize_timeouts(self) -> Iterator[None]:
try:
yield
except httpx.TimeoutException as exc:
raise LLMTimeoutError from exc
except Exception as exc:
if self._is_openai_timeout(exc):
raise LLMTimeoutError from exc
raise
def _is_openai_timeout(self, exc: Exception) -> bool:
if self.settings.llm_backend != LLMBackend.OPENAI_LIKE:
return False
# Keep OpenAI imports out of module import paths and only load the SDK
# when translating an error from an OpenAI-backed request.
from openai import APITimeoutError
return isinstance(exc, APITimeoutError)
def run_chat(self, messages: list["ChatMessage"]) -> str:
logger.debug(
"Running chat query against %s with model %s",
self.settings.llm_backend,
self.settings.llm_model,
)
result = self.llm.chat(messages)
logger.debug("Chat result: %s", result)
return result
-5
View File
@@ -32,18 +32,15 @@ def get_embedding_model(config: AIConfig) -> "BaseEmbedding":
http_client = create_pinned_httpx_client(
endpoint,
allow_internal=config.llm_allow_internal_endpoints,
timeout=config.llm_request_timeout,
)
async_http_client = create_pinned_async_httpx_client(
endpoint,
allow_internal=config.llm_allow_internal_endpoints,
timeout=config.llm_request_timeout,
)
return OpenAILikeEmbedding(
model_name=config.llm_embedding_model or "text-embedding-3-small",
api_key=config.llm_api_key,
api_base=endpoint,
timeout=config.llm_request_timeout,
http_client=http_client,
async_http_client=async_http_client,
)
@@ -76,14 +73,12 @@ def get_embedding_model(config: AIConfig) -> "BaseEmbedding":
)
embedding._client = Client(
host=endpoint,
timeout=config.llm_request_timeout,
transport=PinnedHostHTTPTransport(
allow_internal=config.llm_allow_internal_endpoints,
),
)
embedding._async_client = AsyncClient(
host=endpoint,
timeout=config.llm_request_timeout,
transport=PinnedHostAsyncHTTPTransport(
allow_internal=config.llm_allow_internal_endpoints,
),
-2
View File
@@ -1,2 +0,0 @@
class LLMTimeoutError(Exception):
pass
+7 -32
View File
@@ -3,14 +3,12 @@ from unittest.mock import ANY
from unittest.mock import MagicMock
from unittest.mock import patch
import httpx
import openai
import pytest
from llama_index.core.llms import ChatMessage
from llama_index.core.llms.llm import ToolSelection
from paperless_ai.client import LLM_SYSTEM_PROMPT
from paperless_ai.client import AIClient
from paperless_ai.exceptions import LLMTimeoutError
@pytest.fixture
@@ -19,7 +17,6 @@ def mock_ai_config():
mock_config = MagicMock()
mock_config.llm_allow_internal_endpoints = True
mock_config.llm_context_size = 8192
mock_config.llm_request_timeout = 120
MockAIConfig.return_value = mock_config
yield mock_config
@@ -67,7 +64,6 @@ def test_get_llm_openai(mock_ai_config, mock_openai_llm):
model="test_model",
api_base="http://test-url",
api_key="test_api_key",
timeout=120,
is_chat_model=True,
is_function_calling_model=True,
system_prompt=LLM_SYSTEM_PROMPT,
@@ -155,38 +151,17 @@ def test_run_llm_query_openai_uses_tools(mock_ai_config, mock_openai_llm):
mock_llm_instance.chat_with_tools.assert_called_once()
def test_run_llm_query_openai_timeout_raises_local_error(
mock_ai_config,
mock_openai_llm,
):
mock_ai_config.llm_backend = "openai-like"
mock_ai_config.llm_model = "test_model"
mock_ai_config.llm_api_key = "test_api_key"
mock_ai_config.llm_endpoint = "http://test-url"
request = httpx.Request("POST", "http://test-url/v1/chat/completions")
mock_openai_llm.return_value.chat_with_tools.side_effect = openai.APITimeoutError(
request,
)
client = AIClient()
with pytest.raises(LLMTimeoutError):
client.run_llm_query("test_prompt")
def test_run_llm_query_httpx_timeout_raises_local_error(
mock_ai_config,
mock_ollama_llm,
):
def test_run_chat(mock_ai_config, mock_ollama_llm):
mock_ai_config.llm_backend = "ollama"
mock_ai_config.llm_model = "test_model"
mock_ai_config.llm_endpoint = "http://test-url"
mock_llm_instance = mock_ollama_llm.return_value
mock_llm_instance.chat.side_effect = httpx.ReadTimeout("timed out")
mock_llm_instance.chat.return_value = "test_chat_result"
client = AIClient()
messages = [ChatMessage(role="user", content="Hello")]
result = client.run_chat(messages)
with pytest.raises(LLMTimeoutError):
client.run_llm_query("test_prompt")
mock_llm_instance.chat.assert_called_once_with(messages)
assert result == "test_chat_result"
-3
View File
@@ -19,7 +19,6 @@ def mock_ai_config():
MockAIConfig.return_value.llm_embedding_endpoint = None
MockAIConfig.return_value.llm_allow_internal_endpoints = True
MockAIConfig.return_value.llm_context_size = 8192
MockAIConfig.return_value.llm_request_timeout = 120
yield MockAIConfig
@@ -72,7 +71,6 @@ def test_get_embedding_model_openai(mock_ai_config):
model_name="text-embedding-3-small",
api_key="test_api_key",
api_base="http://test-url",
timeout=120,
http_client=ANY,
async_http_client=ANY,
)
@@ -94,7 +92,6 @@ def test_get_embedding_model_openai_prefers_embedding_endpoint(mock_ai_config):
model_name="text-embedding-3-small",
api_key="test_api_key",
api_base="http://embedding-url",
timeout=120,
http_client=ANY,
async_http_client=ANY,
)
@@ -1,158 +0,0 @@
# Generated by Django 5.2.14 on 2026-06-04 15:10
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
replaces = [
("paperless_mail", "0002_optimize_integer_field_sizes"),
("paperless_mail", "0003_mailrule_stop_processing"),
]
dependencies = [
("paperless_mail", "0001_squashed"),
]
operations = [
migrations.AlterField(
model_name="mailaccount",
name="account_type",
field=models.PositiveSmallIntegerField(
choices=[(1, "IMAP"), (2, "Gmail OAuth"), (3, "Outlook OAuth")],
default=1,
verbose_name="account type",
),
),
migrations.AlterField(
model_name="mailaccount",
name="imap_port",
field=models.PositiveIntegerField(
blank=True,
help_text="This is usually 143 for unencrypted and STARTTLS connections, and 993 for SSL connections.",
null=True,
verbose_name="IMAP port",
),
),
migrations.AlterField(
model_name="mailaccount",
name="imap_security",
field=models.PositiveSmallIntegerField(
choices=[(1, "No encryption"), (2, "Use SSL"), (3, "Use STARTTLS")],
default=2,
verbose_name="IMAP security",
),
),
migrations.AlterField(
model_name="mailrule",
name="action",
field=models.PositiveSmallIntegerField(
choices=[
(1, "Delete"),
(2, "Move to specified folder"),
(3, "Mark as read, don't process read mails"),
(4, "Flag the mail, don't process flagged mails"),
(5, "Tag the mail with specified tag, don't process tagged mails"),
],
default=3,
verbose_name="action",
),
),
migrations.AlterField(
model_name="mailrule",
name="assign_correspondent_from",
field=models.PositiveSmallIntegerField(
choices=[
(1, "Do not assign a correspondent"),
(2, "Use mail address"),
(3, "Use name (or mail address if not available)"),
(4, "Use correspondent selected below"),
],
default=1,
verbose_name="assign correspondent from",
),
),
migrations.AlterField(
model_name="mailrule",
name="assign_title_from",
field=models.PositiveSmallIntegerField(
choices=[
(1, "Use subject as title"),
(2, "Use attachment filename as title"),
(3, "Do not assign title from rule"),
],
default=1,
verbose_name="assign title from",
),
),
migrations.AlterField(
model_name="mailrule",
name="attachment_type",
field=models.PositiveSmallIntegerField(
choices=[
(1, "Only process attachments."),
(2, "Process all files, including 'inline' attachments."),
],
default=1,
help_text="Inline attachments include embedded images, so it's best to combine this option with a filename filter.",
verbose_name="attachment type",
),
),
migrations.AlterField(
model_name="mailrule",
name="consumption_scope",
field=models.PositiveSmallIntegerField(
choices=[
(1, "Only process attachments."),
(
2,
"Process full Mail (with embedded attachments in file) as .eml",
),
(
3,
"Process full Mail (with embedded attachments in file) as .eml + process attachments as separate documents",
),
],
default=1,
verbose_name="consumption scope",
),
),
migrations.AlterField(
model_name="mailrule",
name="maximum_age",
field=models.PositiveSmallIntegerField(
default=30,
help_text="Specified in days.",
verbose_name="maximum age",
),
),
migrations.AlterField(
model_name="mailrule",
name="order",
field=models.SmallIntegerField(default=0, verbose_name="order"),
),
migrations.AlterField(
model_name="mailrule",
name="pdf_layout",
field=models.PositiveSmallIntegerField(
choices=[
(0, "System default"),
(1, "Text, then HTML"),
(2, "HTML, then text"),
(3, "HTML only"),
(4, "Text only"),
],
default=0,
verbose_name="pdf layout",
),
),
migrations.AddField(
model_name="mailrule",
name="stop_processing",
field=models.BooleanField(
default=False,
help_text="If True, no further rules will be processed after this one if any document is queued.",
verbose_name="Stop processing further rules",
),
),
]