mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-05 23:58:52 +00:00
Compare commits
7 Commits
feature-se
...
feature-ve
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
54586be06b | ||
|
|
17ff136e1f | ||
|
|
ddce646c7e | ||
|
|
27bc96806f | ||
|
|
dde52a6bc2 | ||
|
|
a5355010b0 | ||
|
|
8d3bb2ebea |
File diff suppressed because it is too large
Load Diff
@@ -1,121 +0,0 @@
|
||||
============================= test session starts ==============================
|
||||
platform linux -- Python 3.14.3, pytest-9.0.2, pluggy-1.6.0 -- /home/trenton/Documents/projects/paperless-ngx/.venv/bin/python
|
||||
cachedir: .pytest_cache
|
||||
django: version: 5.2.12, settings: paperless.settings (from ini)
|
||||
rootdir: /home/trenton/Documents/projects/paperless-ngx
|
||||
configfile: pyproject.toml
|
||||
plugins: sugar-1.1.1, xdist-3.8.0, cov-7.0.0, httpx-0.36.0, django-4.12.0, Faker-40.8.0, env-1.5.0, time-machine-3.2.0, mock-3.15.1, anyio-4.12.1, rerunfailures-16.1
|
||||
collecting ... collected 6 items
|
||||
|
||||
src/documents/tests/test_search_profiling.py::TestSearchProfilingBaseline::test_profile_relevance_search Creating test database for alias 'default'...
|
||||
|
||||
============================================================
|
||||
Profile: BEFORE — relevance search (no ordering)
|
||||
============================================================
|
||||
Wall time: 0.9622s
|
||||
Queries: 33 (0.0000s)
|
||||
Memory delta: 16557.2 KiB
|
||||
Peak memory: 16584.0 KiB
|
||||
|
||||
Top 5 allocations:
|
||||
<frozen importlib._bootstrap_external>:511: size=5480 KiB (+5480 KiB), count=45642 (+45642), average=123 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/fido2/rpid.py:47: size=518 KiB (+518 KiB), count=9769 (+9769), average=54 B
|
||||
<frozen abc>:106: size=432 KiB (+432 KiB), count=1480 (+1480), average=299 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/langdetect/utils/ngram.py:257: size=391 KiB (+391 KiB), count=6667 (+6667), average=60 B
|
||||
<frozen importlib._bootstrap>:491: size=284 KiB (+284 KiB), count=2543 (+2543), average=114 B
|
||||
============================================================
|
||||
|
||||
PASSED
|
||||
src/documents/tests/test_search_profiling.py::TestSearchProfilingBaseline::test_profile_sorted_search
|
||||
============================================================
|
||||
Profile: BEFORE — sorted search (ordering=created)
|
||||
============================================================
|
||||
Wall time: 0.1320s
|
||||
Queries: 32 (0.0010s)
|
||||
Memory delta: 880.8 KiB
|
||||
Peak memory: 906.8 KiB
|
||||
|
||||
Top 5 allocations:
|
||||
/home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:575: size=50.1 KiB (+50.1 KiB), count=521 (+521), average=99 B
|
||||
/home/trenton/.local/share/uv/python/cpython-3.14.3-linux-x86_64-gnu/lib/python3.14/copyreg.py:104: size=49.7 KiB (+49.7 KiB), count=315 (+315), average=162 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django/db/models/sql/query.py:386: size=38.0 KiB (+38.0 KiB), count=160 (+160), average=243 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django_filters/filterset.py:209: size=32.0 KiB (+32.0 KiB), count=82 (+82), average=400 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django_filters/filters.py:158: size=21.4 KiB (+21.4 KiB), count=104 (+104), average=210 B
|
||||
============================================================
|
||||
|
||||
PASSED
|
||||
src/documents/tests/test_search_profiling.py::TestSearchProfilingBaseline::test_profile_paginated_search
|
||||
============================================================
|
||||
Profile: BEFORE — paginated search (page=2, page_size=25)
|
||||
============================================================
|
||||
Wall time: 0.1395s
|
||||
Queries: 32 (0.0000s)
|
||||
Memory delta: 868.1 KiB
|
||||
Peak memory: 893.5 KiB
|
||||
|
||||
Top 5 allocations:
|
||||
/home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:575: size=50.1 KiB (+50.1 KiB), count=521 (+521), average=99 B
|
||||
/home/trenton/.local/share/uv/python/cpython-3.14.3-linux-x86_64-gnu/lib/python3.14/copyreg.py:104: size=49.2 KiB (+49.2 KiB), count=315 (+315), average=160 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django/db/models/sql/query.py:386: size=38.1 KiB (+38.1 KiB), count=161 (+161), average=242 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django_filters/filterset.py:209: size=32.0 KiB (+32.0 KiB), count=82 (+82), average=400 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django_filters/filters.py:158: size=21.3 KiB (+21.3 KiB), count=104 (+104), average=209 B
|
||||
============================================================
|
||||
|
||||
PASSED
|
||||
src/documents/tests/test_search_profiling.py::TestSearchProfilingBaseline::test_profile_search_with_selection_data
|
||||
============================================================
|
||||
Profile: BEFORE — search with selection_data
|
||||
============================================================
|
||||
Wall time: 0.1656s
|
||||
Queries: 37 (0.0020s)
|
||||
Memory delta: 926.9 KiB
|
||||
Peak memory: 1084.3 KiB
|
||||
|
||||
Top 5 allocations:
|
||||
/home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:575: size=50.1 KiB (+50.1 KiB), count=521 (+521), average=99 B
|
||||
/home/trenton/.local/share/uv/python/cpython-3.14.3-linux-x86_64-gnu/lib/python3.14/copyreg.py:104: size=49.6 KiB (+49.6 KiB), count=327 (+327), average=155 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django/db/models/sql/query.py:386: size=38.1 KiB (+38.1 KiB), count=161 (+161), average=242 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django_filters/filterset.py:209: size=32.0 KiB (+32.0 KiB), count=82 (+82), average=400 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django/db/backends/sqlite3/operations.py:193: size=27.1 KiB (+27.1 KiB), count=37 (+37), average=751 B
|
||||
============================================================
|
||||
|
||||
PASSED
|
||||
src/documents/tests/test_search_profiling.py::TestSearchProfilingBaseline::test_profile_backend_search_only
|
||||
============================================================
|
||||
Profile: BEFORE — backend.search(page_size=10000, all highlights)
|
||||
============================================================
|
||||
Wall time: 0.0175s
|
||||
Queries: 0 (0.0000s)
|
||||
Memory delta: 88.6 KiB
|
||||
Peak memory: 100.3 KiB
|
||||
|
||||
Top 5 allocations:
|
||||
/home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:575: size=51.2 KiB (+51.2 KiB), count=530 (+530), average=99 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:557: size=17.8 KiB (+17.8 KiB), count=200 (+200), average=91 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:542: size=8576 B (+8576 B), count=134 (+134), average=64 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:522: size=4800 B (+4800 B), count=200 (+200), average=24 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:515: size=2376 B (+2376 B), count=99 (+99), average=24 B
|
||||
============================================================
|
||||
|
||||
PASSED
|
||||
src/documents/tests/test_search_profiling.py::TestSearchProfilingBaseline::test_profile_backend_search_single_page
|
||||
============================================================
|
||||
Profile: BEFORE — backend.search(page_size=25)
|
||||
============================================================
|
||||
Wall time: 0.0070s
|
||||
Queries: 0 (0.0000s)
|
||||
Memory delta: 5.9 KiB
|
||||
Peak memory: 11.3 KiB
|
||||
|
||||
Top 5 allocations:
|
||||
/home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:557: size=2275 B (+2275 B), count=25 (+25), average=91 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:575: size=1600 B (+1600 B), count=25 (+25), average=64 B
|
||||
/home/trenton/.local/share/uv/python/cpython-3.14.3-linux-x86_64-gnu/lib/python3.14/weakref.py:73: size=1280 B (+1280 B), count=20 (+20), average=64 B
|
||||
/home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:574: size=256 B (+256 B), count=1 (+1), average=256 B
|
||||
/home/trenton/.local/share/uv/python/cpython-3.14.3-linux-x86_64-gnu/lib/python3.14/tracemalloc.py:560: size=240 B (+240 B), count=1 (+1), average=240 B
|
||||
============================================================
|
||||
|
||||
PASSEDDestroying test database for alias 'default'...
|
||||
|
||||
|
||||
======================== 6 passed in 241.83s (0:04:01) =========================
|
||||
@@ -457,7 +457,7 @@ fields and permissions, which will be merged.
|
||||
|
||||
#### Types {#workflow-trigger-types}
|
||||
|
||||
Currently, there are four events that correspond to workflow trigger 'types':
|
||||
Currently, there are five events that correspond to workflow trigger 'types':
|
||||
|
||||
1. **Consumption Started**: _before_ a document is consumed, so events can include filters by source (mail, consumption
|
||||
folder or API), file path, file name, mail rule
|
||||
@@ -469,8 +469,10 @@ Currently, there are four events that correspond to workflow trigger 'types':
|
||||
4. **Scheduled**: a scheduled trigger that can be used to run workflows at a specific time. The date used can be either the document
|
||||
added, created, updated date or you can specify a (date) custom field. You can also specify a day offset from the date (positive
|
||||
offsets will trigger after the date, negative offsets will trigger before).
|
||||
5. **Version Added**: when a new version is added for an existing document. This trigger evaluates filters against the root document
|
||||
and applies actions to the root document.
|
||||
|
||||
The following flow diagram illustrates the four document trigger types:
|
||||
The following flow diagram illustrates the document trigger types:
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
@@ -486,6 +488,10 @@ flowchart TD
|
||||
'Updated'
|
||||
trigger(s)"}
|
||||
|
||||
version{"Matching
|
||||
'Version Added'
|
||||
trigger(s)"}
|
||||
|
||||
scheduled{"Documents
|
||||
matching
|
||||
trigger(s)"}
|
||||
@@ -502,11 +508,15 @@ flowchart TD
|
||||
updated --> |Yes| J[Workflow Actions Run]
|
||||
updated --> |No| K
|
||||
J --> K[Document Saved]
|
||||
L[Scheduled Task Check<br/>hourly at :05] --> M[Get All Scheduled Triggers]
|
||||
M --> scheduled
|
||||
scheduled --> |Yes| N[Workflow Actions Run]
|
||||
scheduled --> |No| O[Document Saved]
|
||||
N --> O
|
||||
L[New Document Version Added] --> version
|
||||
version --> |Yes| V[Workflow Actions Run]
|
||||
version --> |No| W
|
||||
V --> W[Document Saved]
|
||||
X[Scheduled Task Check<br/>hourly at :05] --> Y[Get All Scheduled Triggers]
|
||||
Y --> scheduled
|
||||
scheduled --> |Yes| Z[Workflow Actions Run]
|
||||
scheduled --> |No| AA[Document Saved]
|
||||
Z --> AA
|
||||
```
|
||||
|
||||
#### Filters {#workflow-trigger-filters}
|
||||
|
||||
@@ -164,7 +164,7 @@
|
||||
<pngx-input-text i18n-title title="Filter path" formControlName="filter_path" horizontal="true" i18n-hint hint="Apply to documents that match this path. Wildcards specified as * are allowed. Case-normalized." [error]="error?.filter_path"></pngx-input-text>
|
||||
<pngx-input-select i18n-title title="Filter mail rule" [items]="mailRules" horizontal="true" [allowNull]="true" formControlName="filter_mailrule" i18n-hint hint="Apply to documents consumed via this mail rule." [error]="error?.filter_mailrule"></pngx-input-select>
|
||||
}
|
||||
@if (formGroup.get('type').value === WorkflowTriggerType.DocumentAdded || formGroup.get('type').value === WorkflowTriggerType.DocumentUpdated || formGroup.get('type').value === WorkflowTriggerType.Scheduled) {
|
||||
@if (formGroup.get('type').value === WorkflowTriggerType.DocumentAdded || formGroup.get('type').value === WorkflowTriggerType.DocumentUpdated || formGroup.get('type').value === WorkflowTriggerType.Scheduled || formGroup.get('type').value === WorkflowTriggerType.VersionAdded) {
|
||||
<pngx-input-select i18n-title title="Content matching algorithm" horizontal="true" [items]="getMatchingAlgorithms()" formControlName="matching_algorithm"></pngx-input-select>
|
||||
@if (matchingPatternRequired(formGroup)) {
|
||||
<pngx-input-text i18n-title title="Content matching pattern" horizontal="true" formControlName="match" [error]="error?.match"></pngx-input-text>
|
||||
@@ -175,7 +175,7 @@
|
||||
}
|
||||
</div>
|
||||
</div>
|
||||
@if (formGroup.get('type').value === WorkflowTriggerType.DocumentAdded || formGroup.get('type').value === WorkflowTriggerType.DocumentUpdated || formGroup.get('type').value === WorkflowTriggerType.Scheduled) {
|
||||
@if (formGroup.get('type').value === WorkflowTriggerType.DocumentAdded || formGroup.get('type').value === WorkflowTriggerType.DocumentUpdated || formGroup.get('type').value === WorkflowTriggerType.Scheduled || formGroup.get('type').value === WorkflowTriggerType.VersionAdded) {
|
||||
<div class="row mt-3">
|
||||
<div class="col">
|
||||
<div class="trigger-filters mb-3">
|
||||
|
||||
@@ -120,6 +120,10 @@ export const WORKFLOW_TYPE_OPTIONS = [
|
||||
id: WorkflowTriggerType.Scheduled,
|
||||
name: $localize`Scheduled`,
|
||||
},
|
||||
{
|
||||
id: WorkflowTriggerType.VersionAdded,
|
||||
name: $localize`Version Added`,
|
||||
},
|
||||
]
|
||||
|
||||
export const WORKFLOW_ACTION_OPTIONS = [
|
||||
|
||||
@@ -12,6 +12,7 @@ export enum WorkflowTriggerType {
|
||||
DocumentAdded = 2,
|
||||
DocumentUpdated = 3,
|
||||
Scheduled = 4,
|
||||
VersionAdded = 5,
|
||||
}
|
||||
|
||||
export enum ScheduleDateField {
|
||||
|
||||
@@ -10,11 +10,13 @@ class DocumentsConfig(AppConfig):
|
||||
def ready(self) -> None:
|
||||
from documents.signals import document_consumption_finished
|
||||
from documents.signals import document_updated
|
||||
from documents.signals import document_version_added
|
||||
from documents.signals.handlers import add_inbox_tags
|
||||
from documents.signals.handlers import add_or_update_document_in_llm_index
|
||||
from documents.signals.handlers import add_to_index
|
||||
from documents.signals.handlers import run_workflows_added
|
||||
from documents.signals.handlers import run_workflows_updated
|
||||
from documents.signals.handlers import run_workflows_version_added
|
||||
from documents.signals.handlers import send_websocket_document_updated
|
||||
from documents.signals.handlers import set_correspondent
|
||||
from documents.signals.handlers import set_document_type
|
||||
@@ -28,6 +30,7 @@ class DocumentsConfig(AppConfig):
|
||||
document_consumption_finished.connect(set_storage_path)
|
||||
document_consumption_finished.connect(add_to_index)
|
||||
document_consumption_finished.connect(run_workflows_added)
|
||||
document_version_added.connect(run_workflows_version_added)
|
||||
document_consumption_finished.connect(add_or_update_document_in_llm_index)
|
||||
document_updated.connect(run_workflows_updated)
|
||||
document_updated.connect(send_websocket_document_updated)
|
||||
|
||||
@@ -44,6 +44,7 @@ from documents.plugins.helpers import ProgressStatusOptions
|
||||
from documents.signals import document_consumption_finished
|
||||
from documents.signals import document_consumption_started
|
||||
from documents.signals import document_updated
|
||||
from documents.signals import document_version_added
|
||||
from documents.signals.handlers import run_workflows
|
||||
from documents.templating.workflows import parse_w_workflow_placeholders
|
||||
from documents.utils import compute_checksum
|
||||
@@ -576,6 +577,13 @@ class ConsumerPlugin(
|
||||
else self.working_copy,
|
||||
)
|
||||
|
||||
if document.root_document_id:
|
||||
document_version_added.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
)
|
||||
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
|
||||
@@ -689,6 +689,7 @@ def document_matches_workflow(
|
||||
trigger_type == WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED
|
||||
or trigger_type == WorkflowTrigger.WorkflowTriggerType.DOCUMENT_UPDATED
|
||||
or trigger_type == WorkflowTrigger.WorkflowTriggerType.SCHEDULED
|
||||
or trigger_type == WorkflowTrigger.WorkflowTriggerType.VERSION_ADDED
|
||||
):
|
||||
trigger_matched, reason = existing_document_matches_workflow(
|
||||
document,
|
||||
|
||||
28
src/documents/migrations/0019_alter_workflowtrigger_type.py
Normal file
28
src/documents/migrations/0019_alter_workflowtrigger_type.py
Normal file
@@ -0,0 +1,28 @@
|
||||
# Generated by Django 5.2.7 on 2026-03-02 00:00
|
||||
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("documents", "0018_saved_view_simple_search_rules"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name="workflowtrigger",
|
||||
name="type",
|
||||
field=models.PositiveSmallIntegerField(
|
||||
choices=[
|
||||
(1, "Consumption Started"),
|
||||
(2, "Document Added"),
|
||||
(3, "Document Updated"),
|
||||
(4, "Scheduled"),
|
||||
(5, "Version Added"),
|
||||
],
|
||||
default=1,
|
||||
verbose_name="Workflow Trigger Type",
|
||||
),
|
||||
),
|
||||
]
|
||||
@@ -1183,6 +1183,7 @@ class WorkflowTrigger(models.Model):
|
||||
DOCUMENT_ADDED = 2, _("Document Added")
|
||||
DOCUMENT_UPDATED = 3, _("Document Updated")
|
||||
SCHEDULED = 4, _("Scheduled")
|
||||
VERSION_ADDED = 5, _("Version Added")
|
||||
|
||||
class DocumentSourceChoices(models.IntegerChoices):
|
||||
CONSUME_FOLDER = DocumentSource.ConsumeFolder.value, _("Consume Folder")
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
from documents.search._backend import SearchHit
|
||||
from documents.search._backend import SearchIndexLockError
|
||||
from documents.search._backend import SearchMode
|
||||
from documents.search._backend import SearchResults
|
||||
@@ -11,7 +10,6 @@ from documents.search._schema import needs_rebuild
|
||||
from documents.search._schema import wipe_index
|
||||
|
||||
__all__ = [
|
||||
"SearchHit",
|
||||
"SearchIndexLockError",
|
||||
"SearchMode",
|
||||
"SearchResults",
|
||||
|
||||
@@ -106,61 +106,27 @@ class SearchResults:
|
||||
|
||||
class TantivyRelevanceList:
|
||||
"""
|
||||
DRF-compatible list wrapper for Tantivy search results.
|
||||
DRF-compatible list wrapper for Tantivy search hits.
|
||||
|
||||
Holds a lightweight ordered list of IDs (for pagination count and
|
||||
``selection_data``) together with a small page of rich ``SearchHit``
|
||||
dicts (for serialization). DRF's ``PageNumberPagination`` calls
|
||||
``__len__`` to compute the total page count and ``__getitem__`` to
|
||||
slice the displayed page.
|
||||
Provides paginated access to search results while storing all hits in memory
|
||||
for efficient ID retrieval. Used by Django REST framework for pagination.
|
||||
|
||||
Args:
|
||||
ordered_ids: All matching document IDs in display order.
|
||||
page_hits: Rich SearchHit dicts for the requested DRF page only.
|
||||
page_offset: Index into *ordered_ids* where *page_hits* starts.
|
||||
Methods:
|
||||
__len__: Returns total hit count for pagination calculations
|
||||
__getitem__: Slices the hit list for page-specific results
|
||||
|
||||
Note: Stores ALL post-filter hits so get_all_result_ids() can return
|
||||
every matching document ID without requiring a second search query.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
ordered_ids: list[int],
|
||||
page_hits: list[SearchHit],
|
||||
page_offset: int = 0,
|
||||
) -> None:
|
||||
self._ordered_ids = ordered_ids
|
||||
self._page_hits = page_hits
|
||||
self._page_offset = page_offset
|
||||
def __init__(self, hits: list[SearchHit]) -> None:
|
||||
self._hits = hits
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self._ordered_ids)
|
||||
return len(self._hits)
|
||||
|
||||
def __getitem__(self, key: int | slice) -> SearchHit | list[SearchHit]:
|
||||
if isinstance(key, int):
|
||||
idx = key if key >= 0 else len(self._ordered_ids) + key
|
||||
if self._page_offset <= idx < self._page_offset + len(self._page_hits):
|
||||
return self._page_hits[idx - self._page_offset]
|
||||
return SearchHit(
|
||||
id=self._ordered_ids[key],
|
||||
score=0.0,
|
||||
rank=idx + 1,
|
||||
highlights={},
|
||||
)
|
||||
start = key.start or 0
|
||||
stop = key.stop or len(self._ordered_ids)
|
||||
# DRF slices to extract the current page. If the slice aligns
|
||||
# with our pre-fetched page_hits, return them directly.
|
||||
if start == self._page_offset and stop <= self._page_offset + len(
|
||||
self._page_hits,
|
||||
):
|
||||
return self._page_hits[: stop - start]
|
||||
# Fallback: return stub dicts (no highlights).
|
||||
return [
|
||||
SearchHit(id=doc_id, score=0.0, rank=start + i + 1, highlights={})
|
||||
for i, doc_id in enumerate(self._ordered_ids[key])
|
||||
]
|
||||
|
||||
def get_all_ids(self) -> list[int]:
|
||||
"""Return all matching document IDs in display order."""
|
||||
return self._ordered_ids
|
||||
def __getitem__(self, key: slice) -> list[SearchHit]:
|
||||
return self._hits[key]
|
||||
|
||||
|
||||
class SearchIndexLockError(Exception):
|
||||
@@ -268,34 +234,6 @@ class TantivyBackend:
|
||||
the underlying index directory changes (e.g., during test isolation).
|
||||
"""
|
||||
|
||||
# Maps DRF ordering field names to Tantivy index field names.
|
||||
SORT_FIELD_MAP: dict[str, str] = {
|
||||
"title": "title_sort",
|
||||
"correspondent__name": "correspondent_sort",
|
||||
"document_type__name": "type_sort",
|
||||
"created": "created",
|
||||
"added": "added",
|
||||
"modified": "modified",
|
||||
"archive_serial_number": "asn",
|
||||
"page_count": "page_count",
|
||||
"num_notes": "num_notes",
|
||||
}
|
||||
|
||||
# Fields where Tantivy's sort order matches the ORM's sort order.
|
||||
# Text-based fields (title, correspondent__name, document_type__name)
|
||||
# are excluded because Tantivy's tokenized fast fields produce different
|
||||
# ordering than the ORM's collation-based ordering.
|
||||
SORTABLE_FIELDS: frozenset[str] = frozenset(
|
||||
{
|
||||
"created",
|
||||
"added",
|
||||
"modified",
|
||||
"archive_serial_number",
|
||||
"page_count",
|
||||
"num_notes",
|
||||
},
|
||||
)
|
||||
|
||||
def __init__(self, path: Path | None = None):
|
||||
# path=None → in-memory index (for tests)
|
||||
# path=some_dir → on-disk index (for production)
|
||||
@@ -334,36 +272,6 @@ class TantivyBackend:
|
||||
if self._index is None:
|
||||
self.open() # pragma: no cover
|
||||
|
||||
def _parse_query(
|
||||
self,
|
||||
query: str,
|
||||
search_mode: SearchMode,
|
||||
) -> tantivy.Query:
|
||||
"""Parse a user query string into a Tantivy Query object."""
|
||||
tz = get_current_timezone()
|
||||
if search_mode is SearchMode.TEXT:
|
||||
return parse_simple_text_query(self._index, query)
|
||||
elif search_mode is SearchMode.TITLE:
|
||||
return parse_simple_title_query(self._index, query)
|
||||
else:
|
||||
return parse_user_query(self._index, query, tz)
|
||||
|
||||
def _apply_permission_filter(
|
||||
self,
|
||||
query: tantivy.Query,
|
||||
user: AbstractBaseUser | None,
|
||||
) -> tantivy.Query:
|
||||
"""Wrap a query with a permission filter if the user is not a superuser."""
|
||||
if user is not None:
|
||||
permission_filter = build_permission_filter(self._schema, user)
|
||||
return tantivy.Query.boolean_query(
|
||||
[
|
||||
(tantivy.Occur.Must, query),
|
||||
(tantivy.Occur.Must, permission_filter),
|
||||
],
|
||||
)
|
||||
return query
|
||||
|
||||
def _build_tantivy_doc(
|
||||
self,
|
||||
document: Document,
|
||||
@@ -527,8 +435,6 @@ class TantivyBackend:
|
||||
*,
|
||||
sort_reverse: bool,
|
||||
search_mode: SearchMode = SearchMode.QUERY,
|
||||
highlight_page: int | None = None,
|
||||
highlight_page_size: int | None = None,
|
||||
) -> SearchResults:
|
||||
"""
|
||||
Execute a search query against the document index.
|
||||
@@ -556,15 +462,45 @@ class TantivyBackend:
|
||||
SearchResults with hits, total count, and processed query
|
||||
"""
|
||||
self._ensure_open()
|
||||
user_query = self._parse_query(query, search_mode)
|
||||
final_query = self._apply_permission_filter(user_query, user)
|
||||
tz = get_current_timezone()
|
||||
if search_mode is SearchMode.TEXT:
|
||||
user_query = parse_simple_text_query(self._index, query)
|
||||
elif search_mode is SearchMode.TITLE:
|
||||
user_query = parse_simple_title_query(self._index, query)
|
||||
else:
|
||||
user_query = parse_user_query(self._index, query, tz)
|
||||
|
||||
# Apply permission filter if user is not None (not superuser)
|
||||
if user is not None:
|
||||
permission_filter = build_permission_filter(self._schema, user)
|
||||
final_query = tantivy.Query.boolean_query(
|
||||
[
|
||||
(tantivy.Occur.Must, user_query),
|
||||
(tantivy.Occur.Must, permission_filter),
|
||||
],
|
||||
)
|
||||
else:
|
||||
final_query = user_query
|
||||
|
||||
searcher = self._index.searcher()
|
||||
offset = (page - 1) * page_size
|
||||
|
||||
# Map sort fields
|
||||
sort_field_map = {
|
||||
"title": "title_sort",
|
||||
"correspondent__name": "correspondent_sort",
|
||||
"document_type__name": "type_sort",
|
||||
"created": "created",
|
||||
"added": "added",
|
||||
"modified": "modified",
|
||||
"archive_serial_number": "asn",
|
||||
"page_count": "page_count",
|
||||
"num_notes": "num_notes",
|
||||
}
|
||||
|
||||
# Perform search
|
||||
if sort_field and sort_field in self.SORT_FIELD_MAP:
|
||||
mapped_field = self.SORT_FIELD_MAP[sort_field]
|
||||
if sort_field and sort_field in sort_field_map:
|
||||
mapped_field = sort_field_map[sort_field]
|
||||
results = searcher.search(
|
||||
final_query,
|
||||
limit=offset + page_size,
|
||||
@@ -598,15 +534,6 @@ class TantivyBackend:
|
||||
snippet_generator = None
|
||||
notes_snippet_generator = None
|
||||
|
||||
# Determine which hits need highlights
|
||||
if highlight_page is not None and highlight_page_size is not None:
|
||||
hl_start = (highlight_page - 1) * highlight_page_size
|
||||
hl_end = hl_start + highlight_page_size
|
||||
else:
|
||||
# Highlight all hits (backward-compatible default)
|
||||
hl_start = 0
|
||||
hl_end = len(page_hits)
|
||||
|
||||
for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
|
||||
# Get the actual document from the searcher using the doc address
|
||||
actual_doc = searcher.doc(doc_address)
|
||||
@@ -615,9 +542,8 @@ class TantivyBackend:
|
||||
|
||||
highlights: dict[str, str] = {}
|
||||
|
||||
# Generate highlights if score > 0 and hit is in the highlight window
|
||||
hit_index = rank - offset - 1 # 0-based index within page_hits
|
||||
if score > 0 and hl_start <= hit_index < hl_end:
|
||||
# Generate highlights if score > 0
|
||||
if score > 0:
|
||||
try:
|
||||
if snippet_generator is None:
|
||||
snippet_generator = tantivy.SnippetGenerator.create(
|
||||
@@ -664,160 +590,6 @@ class TantivyBackend:
|
||||
query=query,
|
||||
)
|
||||
|
||||
def highlight_hits(
|
||||
self,
|
||||
query: str,
|
||||
doc_ids: list[int],
|
||||
*,
|
||||
search_mode: SearchMode = SearchMode.QUERY,
|
||||
) -> list[SearchHit]:
|
||||
"""
|
||||
Generate SearchHit dicts with highlights for specific document IDs.
|
||||
|
||||
Unlike search(), this does not execute a ranked query — it looks up
|
||||
each document by ID and generates snippets against the provided query.
|
||||
Use this when you already know which documents to display (from
|
||||
search_ids + ORM filtering) and just need highlight data.
|
||||
|
||||
Note: Each doc_id requires an individual index lookup because tantivy-py
|
||||
does not expose a batch doc-address-by-ID API. This is acceptable for
|
||||
page-sized batches (typically 25 docs) but should not be called with
|
||||
thousands of IDs.
|
||||
|
||||
Args:
|
||||
query: The search query (used for snippet generation)
|
||||
doc_ids: Ordered list of document IDs to generate hits for
|
||||
search_mode: Query parsing mode (for building the snippet query)
|
||||
|
||||
Returns:
|
||||
List of SearchHit dicts in the same order as doc_ids
|
||||
"""
|
||||
if not doc_ids:
|
||||
return []
|
||||
|
||||
self._ensure_open()
|
||||
user_query = self._parse_query(query, search_mode)
|
||||
|
||||
searcher = self._index.searcher()
|
||||
snippet_generator = None
|
||||
hits: list[SearchHit] = []
|
||||
|
||||
for rank, doc_id in enumerate(doc_ids, start=1):
|
||||
# Look up document by ID
|
||||
id_query = tantivy.Query.range_query(
|
||||
self._schema,
|
||||
"id",
|
||||
tantivy.FieldType.Unsigned,
|
||||
doc_id,
|
||||
doc_id,
|
||||
)
|
||||
results = searcher.search(id_query, limit=1)
|
||||
|
||||
if not results.hits:
|
||||
continue
|
||||
|
||||
doc_address = results.hits[0][1]
|
||||
actual_doc = searcher.doc(doc_address)
|
||||
doc_dict = actual_doc.to_dict()
|
||||
|
||||
highlights: dict[str, str] = {}
|
||||
try:
|
||||
if snippet_generator is None:
|
||||
snippet_generator = tantivy.SnippetGenerator.create(
|
||||
searcher,
|
||||
user_query,
|
||||
self._schema,
|
||||
"content",
|
||||
)
|
||||
|
||||
content_snippet = snippet_generator.snippet_from_doc(actual_doc)
|
||||
if content_snippet:
|
||||
highlights["content"] = str(content_snippet)
|
||||
|
||||
if "notes" in doc_dict:
|
||||
notes_generator = tantivy.SnippetGenerator.create(
|
||||
searcher,
|
||||
user_query,
|
||||
self._schema,
|
||||
"notes",
|
||||
)
|
||||
notes_snippet = notes_generator.snippet_from_doc(actual_doc)
|
||||
if notes_snippet:
|
||||
highlights["notes"] = str(notes_snippet)
|
||||
|
||||
except Exception: # pragma: no cover
|
||||
logger.debug("Failed to generate highlights for doc %s", doc_id)
|
||||
|
||||
hits.append(
|
||||
SearchHit(
|
||||
id=doc_id,
|
||||
score=0.0,
|
||||
rank=rank,
|
||||
highlights=highlights,
|
||||
),
|
||||
)
|
||||
|
||||
return hits
|
||||
|
||||
def search_ids(
|
||||
self,
|
||||
query: str,
|
||||
user: AbstractBaseUser | None,
|
||||
*,
|
||||
sort_field: str | None = None,
|
||||
sort_reverse: bool = False,
|
||||
search_mode: SearchMode = SearchMode.QUERY,
|
||||
limit: int | None = None,
|
||||
) -> list[int]:
|
||||
"""
|
||||
Return document IDs matching a query — no highlights or scores.
|
||||
|
||||
This is the lightweight companion to search(). Use it when you need the
|
||||
full set of matching IDs (e.g. for ``selection_data``) but don't need
|
||||
scores, ranks, or highlights.
|
||||
|
||||
Args:
|
||||
query: User's search query
|
||||
user: User for permission filtering (None for superuser/no filtering)
|
||||
sort_field: Field to sort by (None for relevance ranking)
|
||||
sort_reverse: Whether to reverse the sort order
|
||||
search_mode: Query parsing mode (QUERY, TEXT, or TITLE)
|
||||
limit: Maximum number of IDs to return (None = all matching docs)
|
||||
|
||||
Returns:
|
||||
List of document IDs in the requested order
|
||||
"""
|
||||
self._ensure_open()
|
||||
user_query = self._parse_query(query, search_mode)
|
||||
final_query = self._apply_permission_filter(user_query, user)
|
||||
|
||||
searcher = self._index.searcher()
|
||||
effective_limit = limit if limit is not None else searcher.num_docs
|
||||
|
||||
if sort_field and sort_field in self.SORT_FIELD_MAP:
|
||||
mapped_field = self.SORT_FIELD_MAP[sort_field]
|
||||
results = searcher.search(
|
||||
final_query,
|
||||
limit=effective_limit,
|
||||
order_by_field=mapped_field,
|
||||
order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc,
|
||||
)
|
||||
all_hits = [(hit[1],) for hit in results.hits]
|
||||
else:
|
||||
results = searcher.search(final_query, limit=effective_limit)
|
||||
all_hits = [(hit[1], hit[0]) for hit in results.hits]
|
||||
|
||||
# Normalize scores and apply threshold (relevance search only)
|
||||
if all_hits:
|
||||
max_score = max(hit[1] for hit in all_hits) or 1.0
|
||||
all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
|
||||
|
||||
threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
|
||||
if threshold is not None:
|
||||
all_hits = [hit for hit in all_hits if hit[1] >= threshold]
|
||||
|
||||
return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, *_ in all_hits]
|
||||
|
||||
def autocomplete(
|
||||
self,
|
||||
term: str,
|
||||
@@ -851,7 +623,7 @@ class TantivyBackend:
|
||||
else:
|
||||
base_query = tantivy.Query.all_query()
|
||||
|
||||
results = searcher.search(base_query, limit=searcher.num_docs)
|
||||
results = searcher.search(base_query, limit=10000)
|
||||
|
||||
# Count how many visible documents each word appears in.
|
||||
# Using Counter (not set) preserves per-word document frequency so
|
||||
@@ -927,7 +699,17 @@ class TantivyBackend:
|
||||
boost_factor=None,
|
||||
)
|
||||
|
||||
final_query = self._apply_permission_filter(mlt_query, user)
|
||||
# Apply permission filter
|
||||
if user is not None:
|
||||
permission_filter = build_permission_filter(self._schema, user)
|
||||
final_query = tantivy.Query.boolean_query(
|
||||
[
|
||||
(tantivy.Occur.Must, mlt_query),
|
||||
(tantivy.Occur.Must, permission_filter),
|
||||
],
|
||||
)
|
||||
else:
|
||||
final_query = mlt_query
|
||||
|
||||
# Search
|
||||
offset = (page - 1) * page_size
|
||||
@@ -971,66 +753,6 @@ class TantivyBackend:
|
||||
query=f"more_like:{doc_id}",
|
||||
)
|
||||
|
||||
def more_like_this_ids(
    self,
    doc_id: int,
    user: AbstractBaseUser | None,
    *,
    limit: int | None = None,
) -> list[int]:
    """
    Return IDs of documents similar to the given document — no highlights.

    Lightweight companion to more_like_this(). The original document is
    excluded from results.

    Args:
        doc_id: Primary key of the reference document
        user: User for permission filtering (None for no filtering)
        limit: Maximum number of IDs to return (None = all matching docs)

    Returns:
        List of similar document IDs (excluding the original), in the
        order the searcher returned them. Empty when the reference
        document is not found in the index.
    """
    self._ensure_open()
    searcher = self._index.searcher()

    # Resolve the reference document's index address via an exact-match
    # range query on its id (lower bound == upper bound == doc_id).
    id_query = tantivy.Query.range_query(
        self._schema,
        "id",
        tantivy.FieldType.Unsigned,
        doc_id,
        doc_id,
    )
    seed_hits = searcher.search(id_query, limit=1).hits
    if not seed_hits:
        return []

    seed_address = seed_hits[0][1]
    mlt_query = tantivy.Query.more_like_this_query(
        seed_address,
        min_doc_frequency=1,
        max_doc_frequency=None,
        min_term_frequency=1,
        max_query_terms=12,
        min_word_length=None,
        max_word_length=None,
        boost_factor=None,
    )

    final_query = self._apply_permission_filter(mlt_query, user)

    # The reference document can itself be among the hits and is dropped
    # below. Ask for one extra hit so that dropping it still leaves up to
    # ``limit`` IDs instead of silently returning ``limit - 1``.
    fetch_limit = searcher.num_docs if limit is None else limit + 1
    results = searcher.search(final_query, limit=fetch_limit)

    ids = []
    for _score, doc_address in results.hits:
        result_doc_id = searcher.doc(doc_address).to_dict()["id"][0]
        if result_doc_id != doc_id:
            ids.append(result_doc_id)

    # Trim the spare hit when the reference document was not among the results.
    return ids if limit is None else ids[:limit]
|
||||
|
||||
def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
|
||||
"""
|
||||
Get a batch context manager for bulk index operations.
|
||||
|
||||
@@ -3,3 +3,4 @@ from django.dispatch import Signal
|
||||
# Module-level Django signals; each is a plain ``Signal()`` instance.
document_consumption_started = Signal()
document_consumption_finished = Signal()
document_updated = Signal()
# NOTE(review): presumably sent when a new version of an existing document
# is added — confirm against the send site.
document_version_added = Signal()
|
||||
|
||||
@@ -814,6 +814,19 @@ def run_workflows_added(
|
||||
)
|
||||
|
||||
|
||||
def run_workflows_version_added(
    sender,
    document: Document,
    logging_group: uuid.UUID | None = None,
    **kwargs,
) -> None:
    """
    Run VERSION_ADDED workflows for the root of the given document.

    The workflow is run against ``document.root_document`` rather than the
    version document itself. If the document has no root (i.e. it is a root
    document), the document is used directly instead of passing ``None``.

    Args:
        sender: Sender of the signal (unused).
        document: The document whose version was added.
        logging_group: Optional logging group forwarded to run_workflows.
        **kwargs: Extra signal arguments (ignored).
    """
    root_document = document.root_document
    # Explicit ``is not None`` check: do not rely on model truthiness.
    target = root_document if root_document is not None else document
    run_workflows(
        trigger_type=WorkflowTrigger.WorkflowTriggerType.VERSION_ADDED,
        document=target,
        logging_group=logging_group,
    )
|
||||
|
||||
|
||||
def run_workflows_updated(
|
||||
sender,
|
||||
document: Document,
|
||||
|
||||
@@ -428,162 +428,6 @@ class TestSearch:
|
||||
== 0
|
||||
)
|
||||
|
||||
def test_highlight_page_only_highlights_requested_slice(
    self,
    backend: TantivyBackend,
):
    """Highlights are populated only for hits inside the highlight_page slice."""
    # Index six documents that all match the query term.
    for idx in range(6):
        backend.add_or_update(
            Document.objects.create(
                title=f"highlight doc {idx}",
                content=f"searchable highlight content number {idx}",
                checksum=f"HP{idx}",
            ),
        )

    result = backend.search(
        "searchable",
        user=None,
        page=1,
        page_size=10000,
        sort_field=None,
        sort_reverse=False,
        highlight_page=1,
        highlight_page_size=3,
    )

    assert result.total == 6
    assert len(result.hits) == 6

    # Split the hits by whether their highlights mapping is non-empty.
    with_highlights = sum(1 for hit in result.hits if hit["highlights"])
    without_highlights = sum(1 for hit in result.hits if not hit["highlights"])
    assert with_highlights == 3
    assert without_highlights == 3
|
||||
|
||||
def test_highlight_page_2_highlights_correct_slice(self, backend: TantivyBackend):
    """With highlight_page=2, only the second highlight page carries highlights."""
    for idx in range(6):
        backend.add_or_update(
            Document.objects.create(
                title=f"page2 doc {idx}",
                content=f"searchable page2 content number {idx}",
                checksum=f"HP2{idx}",
            ),
        )

    result = backend.search(
        "searchable",
        user=None,
        page=1,
        page_size=10000,
        sort_field=None,
        sort_reverse=False,
        highlight_page=2,
        highlight_page_size=2,
    )

    assert result.total == 6
    assert len(result.hits) == 6

    with_highlights = sum(1 for hit in result.hits if hit["highlights"])
    without_highlights = sum(1 for hit in result.hits if not hit["highlights"])
    # The second highlight page holds two hits; the other four stay bare.
    assert with_highlights == 2
    assert without_highlights == 4
|
||||
|
||||
def test_no_highlight_page_highlights_all(self, backend: TantivyBackend):
    """Without highlight_page, every hit carries a "content" highlight (backward compat)."""
    for idx in range(3):
        backend.add_or_update(
            Document.objects.create(
                title=f"compat doc {idx}",
                content=f"searchable compat content {idx}",
                checksum=f"HC{idx}",
            ),
        )

    result = backend.search(
        "searchable",
        user=None,
        page=1,
        page_size=10000,
        sort_field=None,
        sort_reverse=False,
    )

    assert len(result.hits) == 3
    assert all("content" in hit["highlights"] for hit in result.hits)
|
||||
|
||||
|
||||
class TestSearchIds:
    """Exercise search_ids(), the ID-only search entry point."""

    def test_returns_matching_ids(self, backend: TantivyBackend):
        """Every matching document's ID is returned; non-matching ones are not."""
        expected_pks = set()
        for idx in range(5):
            match = Document.objects.create(
                title=f"findable doc {idx}",
                content="common keyword",
                checksum=f"SI{idx}",
            )
            backend.add_or_update(match)
            expected_pks.add(match.pk)

        # A document that shares no terms with the query.
        unrelated = Document.objects.create(
            title="unrelated",
            content="nothing here",
            checksum="SI_other",
        )
        backend.add_or_update(unrelated)

        found = backend.search_ids(
            "common keyword",
            user=None,
            search_mode=SearchMode.QUERY,
        )

        assert set(found) == expected_pks
        assert unrelated.pk not in found

    def test_respects_permission_filter(self, backend: TantivyBackend):
        """An owned document is returned for its owner and hidden from another user."""
        owner = User.objects.create_user("ids_owner")
        other = User.objects.create_user("ids_other")
        private_doc = Document.objects.create(
            title="private doc",
            content="secret keyword",
            checksum="SIP1",
            owner=owner,
        )
        backend.add_or_update(private_doc)

        seen_by_owner = backend.search_ids(
            "secret",
            user=owner,
            search_mode=SearchMode.QUERY,
        )
        assert seen_by_owner == [private_doc.pk]

        seen_by_other = backend.search_ids(
            "secret",
            user=other,
            search_mode=SearchMode.QUERY,
        )
        assert seen_by_other == []

    def test_respects_fuzzy_threshold(self, backend: TantivyBackend, settings):
        """A threshold of 1.1 filters out every hit, so no IDs come back."""
        backend.add_or_update(
            Document.objects.create(
                title="threshold test",
                content="unique term",
                checksum="SIT1",
            ),
        )

        settings.ADVANCED_FUZZY_SEARCH_THRESHOLD = 1.1
        found = backend.search_ids("unique", user=None, search_mode=SearchMode.QUERY)
        assert found == []

    def test_returns_ids_for_text_mode(self, backend: TantivyBackend):
        """search_ids returns the matching ID when called with SearchMode.TEXT."""
        text_doc = Document.objects.create(
            title="text mode doc",
            content="findable phrase",
            checksum="SIM1",
        )
        backend.add_or_update(text_doc)

        found = backend.search_ids("findable", user=None, search_mode=SearchMode.TEXT)
        assert found == [text_doc.pk]
|
||||
|
||||
|
||||
class TestRebuild:
|
||||
"""Test index rebuilding functionality."""
|
||||
@@ -698,27 +542,6 @@ class TestMoreLikeThis:
|
||||
assert results.hits == []
|
||||
assert results.total == 0
|
||||
|
||||
def test_more_like_this_ids_excludes_original(self, backend: TantivyBackend):
    """more_like_this_ids returns the similar document's ID but not the seed's."""
    seed_pk, similar_pk = 150, 151
    fixtures = (
        ("Important document", "MLTI1", seed_pk),
        ("Another document", "MLTI2", similar_pk),
    )
    # Create both rows first, then index them, mirroring create/create/add/add.
    created = [
        Document.objects.create(
            title=title,
            content="financial information report",
            checksum=checksum,
            pk=pk,
        )
        for title, checksum, pk in fixtures
    ]
    for document in created:
        backend.add_or_update(document)

    similar_ids = backend.more_like_this_ids(doc_id=150, user=None)

    assert seed_pk not in similar_ids
    assert similar_pk in similar_ids
|
||||
|
||||
|
||||
class TestSingleton:
|
||||
"""Test get_backend() and reset_backend() singleton lifecycle."""
|
||||
|
||||
@@ -1503,89 +1503,6 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
[d2.id, d1.id, d3.id],
|
||||
)
|
||||
|
||||
def test_search_with_tantivy_native_sort(self) -> None:
    """Ordering by archive_serial_number yields correctly sorted results both ways."""
    index = get_backend()
    for position, serial in enumerate([30, 10, 20]):
        index.add_or_update(
            Document.objects.create(
                title=f"sortable doc {position}",
                content="searchable content",
                checksum=f"TNS{position}",
                archive_serial_number=serial,
            ),
        )

    # Ascending first, then descending — same request order as before.
    cases = (
        ("/api/documents/?query=searchable&ordering=archive_serial_number", [10, 20, 30]),
        ("/api/documents/?query=searchable&ordering=-archive_serial_number", [30, 20, 10]),
    )
    for url, expected_serials in cases:
        response = self.client.get(url)
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        returned_serials = [
            item["archive_serial_number"] for item in response.data["results"]
        ]
        self.assertEqual(returned_serials, expected_serials)
|
||||
|
||||
def test_search_page_2_returns_correct_slice(self) -> None:
    """Page 2 holds the next slice: disjoint from page 1 and with higher ASNs."""
    index = get_backend()
    for idx in range(10):
        index.add_or_update(
            Document.objects.create(
                title=f"doc {idx}",
                content="paginated content",
                checksum=f"PG2{idx}",
                archive_serial_number=idx + 1,
            ),
        )

    first_response = self.client.get(
        "/api/documents/?query=paginated&ordering=archive_serial_number&page=1&page_size=3",
    )
    first_page_ids = [row["id"] for row in first_response.data["results"]]
    self.assertEqual(len(first_page_ids), 3)

    second_response = self.client.get(
        "/api/documents/?query=paginated&ordering=archive_serial_number&page=2&page_size=3",
    )
    second_page_ids = [row["id"] for row in second_response.data["results"]]
    self.assertEqual(len(second_page_ids), 3)

    # The two pages must not share any document.
    self.assertEqual(set(first_page_ids) & set(second_page_ids), set())

    # Every ASN on page 2 must exceed every ASN on page 1.
    first_page_asns = [
        Document.objects.get(pk=pk).archive_serial_number for pk in first_page_ids
    ]
    second_page_asns = [
        Document.objects.get(pk=pk).archive_serial_number for pk in second_page_ids
    ]
    self.assertLess(max(first_page_asns), min(second_page_asns))
|
||||
|
||||
def test_search_all_field_contains_all_ids_when_paginated(self) -> None:
    """The "all" field lists every matching ID even though results are paginated."""
    index = get_backend()
    expected_ids = []
    for idx in range(10):
        created = Document.objects.create(
            title=f"all field doc {idx}",
            content="allfield content",
            checksum=f"AF{idx}",
        )
        index.add_or_update(created)
        expected_ids.append(created.pk)

    response = self.client.get(
        "/api/documents/?query=allfield&page=1&page_size=3",
        headers={"Accept": "application/json; version=9"},
    )

    self.assertEqual(response.status_code, status.HTTP_200_OK)
    # Only the first page of three is returned in "results" ...
    self.assertEqual(len(response.data["results"]), 3)
    # ... while "all" still carries all ten matching IDs.
    self.assertCountEqual(response.data["all"], expected_ids)
|
||||
|
||||
@mock.patch("documents.bulk_edit.bulk_update_documents")
|
||||
def test_global_search(self, m) -> None:
|
||||
"""
|
||||
|
||||
@@ -720,9 +720,16 @@ class TestConsumer(
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@override_settings(AUDIT_LOG_ENABLED=True)
|
||||
@mock.patch("documents.consumer.document_updated.send")
|
||||
@mock.patch("documents.consumer.document_version_added.send")
|
||||
@mock.patch("documents.consumer.load_classifier")
|
||||
def test_consume_version_creates_new_version(self, m) -> None:
|
||||
m.return_value = MagicMock()
|
||||
def test_consume_version_creates_new_version(
|
||||
self,
|
||||
mock_load_classifier: mock.Mock,
|
||||
mock_document_version_added_send: mock.Mock,
|
||||
mock_document_updated_send: mock.Mock,
|
||||
) -> None:
|
||||
mock_load_classifier.return_value = MagicMock()
|
||||
|
||||
with self.get_consumer(self.get_test_file()) as consumer:
|
||||
consumer.run()
|
||||
@@ -790,6 +797,16 @@ class TestConsumer(
|
||||
self.assertIsNone(version.archive_serial_number)
|
||||
self.assertEqual(version.original_filename, version_file.name)
|
||||
self.assertTrue(bool(version.content))
|
||||
mock_document_version_added_send.assert_called_once()
|
||||
self.assertEqual(
|
||||
mock_document_version_added_send.call_args.kwargs["document"].id,
|
||||
version.id,
|
||||
)
|
||||
mock_document_updated_send.assert_called_once()
|
||||
self.assertEqual(
|
||||
mock_document_updated_send.call_args.kwargs["document"].id,
|
||||
root_doc.id,
|
||||
)
|
||||
|
||||
@override_settings(AUDIT_LOG_ENABLED=True)
|
||||
@mock.patch("documents.consumer.load_classifier")
|
||||
|
||||
@@ -61,6 +61,7 @@ from documents.models import WorkflowTrigger
|
||||
from documents.plugins.base import StopConsumeTaskError
|
||||
from documents.serialisers import WorkflowTriggerSerializer
|
||||
from documents.signals import document_consumption_finished
|
||||
from documents.signals import document_version_added
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import DummyProgressManager
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
@@ -1902,6 +1903,53 @@ class TestWorkflows(
|
||||
).exists(),
|
||||
)
|
||||
|
||||
def test_version_added_workflow_runs_on_root_document(self) -> None:
    """Sending document_version_added for a version applies the workflow to its root."""
    version_added = WorkflowTrigger.WorkflowTriggerType.VERSION_ADDED

    trigger = WorkflowTrigger.objects.create(type=version_added)
    action = WorkflowAction.objects.create(
        assign_title="Updated by version",
        assign_owner=self.user2,
    )
    workflow = Workflow.objects.create(name="Version workflow", order=0)
    workflow.triggers.add(trigger)
    workflow.actions.add(action)

    root_doc = Document.objects.create(
        title="root",
        correspondent=self.c,
        original_filename="root.pdf",
    )
    version_doc = Document.objects.create(
        title="version",
        correspondent=self.c,
        original_filename="version.pdf",
        root_document=root_doc,
    )

    # Fire the signal with the version document, not the root.
    document_version_added.send(sender=self.__class__, document=version_doc)

    root_doc.refresh_from_db()
    version_doc.refresh_from_db()

    # The action's assignments land on the root document only.
    self.assertEqual(root_doc.title, "Updated by version")
    self.assertEqual(root_doc.owner, self.user2)
    self.assertIsNone(version_doc.owner)

    recorded_runs = WorkflowRun.objects.filter(
        workflow=workflow,
        type=version_added,
        document=root_doc,
    ).count()
    self.assertEqual(recorded_runs, 1)
|
||||
|
||||
def test_document_updated_workflow(self) -> None:
|
||||
trigger = WorkflowTrigger.objects.create(
|
||||
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_UPDATED,
|
||||
|
||||
@@ -2058,14 +2058,13 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
if not self._is_search_request():
|
||||
return super().list(request)
|
||||
|
||||
from documents.search import SearchHit
|
||||
from documents.search import SearchMode
|
||||
from documents.search import TantivyBackend
|
||||
from documents.search import TantivyRelevanceList
|
||||
from documents.search import get_backend
|
||||
|
||||
try:
|
||||
backend = get_backend()
|
||||
# ORM-filtered queryset: permissions + field filters + ordering (DRF backends applied)
|
||||
filtered_qs = self.filter_queryset(self.get_queryset())
|
||||
|
||||
user = None if request.user.is_superuser else request.user
|
||||
@@ -2080,28 +2079,6 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
},
|
||||
)
|
||||
|
||||
# Parse ordering param
|
||||
ordering_param = request.query_params.get("ordering", "")
|
||||
sort_reverse = ordering_param.startswith("-")
|
||||
sort_field_name = ordering_param.lstrip("-") or None
|
||||
|
||||
use_tantivy_sort = (
|
||||
sort_field_name in TantivyBackend.SORTABLE_FIELDS
|
||||
or sort_field_name is None
|
||||
)
|
||||
|
||||
# Compute the DRF page so we can tell Tantivy which slice to highlight
|
||||
try:
|
||||
requested_page = int(request.query_params.get("page", 1))
|
||||
except (TypeError, ValueError):
|
||||
requested_page = 1
|
||||
try:
|
||||
requested_page_size = int(
|
||||
request.query_params.get("page_size", self.paginator.page_size),
|
||||
)
|
||||
except (TypeError, ValueError):
|
||||
requested_page_size = self.paginator.page_size
|
||||
|
||||
if (
|
||||
"text" in request.query_params
|
||||
or "title_search" in request.query_params
|
||||
@@ -2116,44 +2093,17 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
else:
|
||||
search_mode = SearchMode.QUERY
|
||||
query_str = request.query_params["query"]
|
||||
|
||||
# Step 1: Get all matching IDs (lightweight, no highlights)
|
||||
all_ids = backend.search_ids(
|
||||
results = backend.search(
|
||||
query_str,
|
||||
user=user,
|
||||
sort_field=sort_field_name if use_tantivy_sort else None,
|
||||
sort_reverse=sort_reverse,
|
||||
page=1,
|
||||
page_size=10000,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=search_mode,
|
||||
)
|
||||
|
||||
# Step 2: Intersect with ORM-visible IDs (field filters)
|
||||
orm_ids = set(filtered_qs.values_list("pk", flat=True))
|
||||
|
||||
if use_tantivy_sort:
|
||||
# Fast path: Tantivy already ordered the IDs
|
||||
ordered_ids = [doc_id for doc_id in all_ids if doc_id in orm_ids]
|
||||
else:
|
||||
# Slow path: ORM must re-sort
|
||||
id_set = set(all_ids) & orm_ids
|
||||
ordered_ids = list(
|
||||
filtered_qs.filter(id__in=id_set).values_list(
|
||||
"pk",
|
||||
flat=True,
|
||||
),
|
||||
)
|
||||
|
||||
# Step 3: Fetch highlights for the displayed page only
|
||||
page_offset = (requested_page - 1) * requested_page_size
|
||||
page_ids = ordered_ids[page_offset : page_offset + requested_page_size]
|
||||
|
||||
page_hits = backend.highlight_hits(
|
||||
query_str,
|
||||
page_ids,
|
||||
search_mode=search_mode,
|
||||
)
|
||||
|
||||
else:
|
||||
# more_like_id path
|
||||
# more_like_id — validate permission on the seed document first
|
||||
try:
|
||||
more_like_doc_id = int(request.query_params["more_like_id"])
|
||||
more_like_doc = Document.objects.select_related("owner").get(
|
||||
@@ -2169,24 +2119,33 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
):
|
||||
raise PermissionDenied(_("Insufficient permissions."))
|
||||
|
||||
# Step 1: Get all matching IDs (lightweight)
|
||||
all_ids = backend.more_like_this_ids(
|
||||
results = backend.more_like_this(
|
||||
more_like_doc_id,
|
||||
user=user,
|
||||
page=1,
|
||||
page_size=10000,
|
||||
)
|
||||
orm_ids = set(filtered_qs.values_list("pk", flat=True))
|
||||
ordered_ids = [doc_id for doc_id in all_ids if doc_id in orm_ids]
|
||||
|
||||
# Step 2: Build hit dicts for the displayed page
|
||||
# MLT has no text query, so no highlights needed
|
||||
page_offset = (requested_page - 1) * requested_page_size
|
||||
page_ids = ordered_ids[page_offset : page_offset + requested_page_size]
|
||||
page_hits = [
|
||||
SearchHit(id=doc_id, score=0.0, rank=rank, highlights={})
|
||||
for rank, doc_id in enumerate(page_ids, start=page_offset + 1)
|
||||
hits_by_id = {h["id"]: h for h in results.hits}
|
||||
|
||||
# Determine sort order: no ordering param -> Tantivy relevance; otherwise -> ORM order
|
||||
ordering_param = request.query_params.get("ordering", "").lstrip("-")
|
||||
if not ordering_param:
|
||||
# Preserve Tantivy relevance order; intersect with ORM-visible IDs
|
||||
orm_ids = set(filtered_qs.values_list("pk", flat=True))
|
||||
ordered_hits = [h for h in results.hits if h["id"] in orm_ids]
|
||||
else:
|
||||
# Use ORM ordering (already applied by DocumentsOrderingFilter)
|
||||
hit_ids = set(hits_by_id.keys())
|
||||
orm_ordered_ids = filtered_qs.filter(id__in=hit_ids).values_list(
|
||||
"pk",
|
||||
flat=True,
|
||||
)
|
||||
ordered_hits = [
|
||||
hits_by_id[pk] for pk in orm_ordered_ids if pk in hits_by_id
|
||||
]
|
||||
|
||||
rl = TantivyRelevanceList(ordered_ids, page_hits, page_offset)
|
||||
rl = TantivyRelevanceList(ordered_hits)
|
||||
page = self.paginate_queryset(rl)
|
||||
|
||||
if page is not None:
|
||||
@@ -2196,14 +2155,15 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
if get_boolean(
|
||||
str(request.query_params.get("include_selection_data", "false")),
|
||||
):
|
||||
all_ids = [h["id"] for h in ordered_hits]
|
||||
response.data["selection_data"] = (
|
||||
self._get_selection_data_for_queryset(
|
||||
filtered_qs.filter(pk__in=ordered_ids),
|
||||
filtered_qs.filter(pk__in=all_ids),
|
||||
)
|
||||
)
|
||||
return response
|
||||
|
||||
serializer = self.get_serializer(page_hits, many=True)
|
||||
serializer = self.get_serializer(ordered_hits, many=True)
|
||||
return Response(serializer.data)
|
||||
|
||||
except NotFound:
|
||||
|
||||
@@ -89,7 +89,7 @@ class StandardPagination(PageNumberPagination):
|
||||
|
||||
query = self.page.paginator.object_list
|
||||
if isinstance(query, TantivyRelevanceList):
|
||||
return query.get_all_ids()
|
||||
return [h["id"] for h in query._hits]
|
||||
return self.page.paginator.object_list.values_list("pk", flat=True)
|
||||
|
||||
def get_paginated_response_schema(self, schema):
|
||||
|
||||
Reference in New Issue
Block a user