From aed9abe48c02b1b7df07e429db3f2ed8c3a58ce8 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 2 Apr 2026 12:38:22 -0700 Subject: [PATCH] Feature: Replace Whoosh with tantivy search backend (#12471) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Claude Sonnet 4.6 Co-authored-by: Antoine Mérino <3023499+Merinorus@users.noreply.github.com> --- .../s6-overlay/s6-rc.d/init-search-index/run | 28 +- docs/administration.md | 47 +- docs/api.md | 5 +- docs/configuration.md | 26 + docs/migration-v3.md | 31 + docs/usage.md | 84 +- pyproject.toml | 4 +- src/documents/admin.py | 15 +- src/documents/bulk_edit.py | 6 +- src/documents/index.py | 675 -------------- .../management/commands/document_index.py | 57 +- ...7_migrate_fulltext_query_field_prefixes.py | 39 + src/documents/models.py | 33 +- src/documents/sanity_checker.py | 17 +- src/documents/search/__init__.py | 21 + src/documents/search/_backend.py | 858 ++++++++++++++++++ src/documents/search/_query.py | 497 ++++++++++ src/documents/search/_schema.py | 165 ++++ src/documents/search/_tokenizer.py | 116 +++ src/documents/serialisers.py | 18 +- src/documents/signals/handlers.py | 13 +- src/documents/tasks.py | 45 +- src/documents/tests/conftest.py | 21 + src/documents/tests/search/__init__.py | 0 src/documents/tests/search/conftest.py | 33 + src/documents/tests/search/test_backend.py | 502 ++++++++++ ...migration_fulltext_query_field_prefixes.py | 138 +++ src/documents/tests/search/test_query.py | 530 +++++++++++ src/documents/tests/search/test_schema.py | 63 ++ src/documents/tests/search/test_tokenizer.py | 78 ++ src/documents/tests/test_admin.py | 33 +- .../tests/test_api_document_versions.py | 37 +- src/documents/tests/test_api_search.py | 421 +++++---- src/documents/tests/test_api_status.py | 30 +- src/documents/tests/test_delayedquery.py | 58 -- src/documents/tests/test_index.py | 371 -------- 
src/documents/tests/test_management.py | 73 +- src/documents/tests/test_matchables.py | 6 + src/documents/tests/test_tag_hierarchy.py | 4 +- src/documents/tests/test_task_signals.py | 30 +- src/documents/tests/test_tasks.py | 25 +- src/documents/tests/test_workflows.py | 1 + src/documents/tests/utils.py | 6 + src/documents/utils.py | 13 + src/documents/views.py | 236 ++--- src/paperless/settings/__init__.py | 51 ++ src/paperless/settings/parsers.py | 2 +- .../parsers/test_tesseract_custom_settings.py | 5 + src/paperless/tests/settings/test_settings.py | 45 + src/paperless/views.py | 19 +- src/paperless_ai/indexing.py | 14 +- uv.lock | 113 ++- 52 files changed, 4050 insertions(+), 1708 deletions(-) delete mode 100644 src/documents/index.py create mode 100644 src/documents/migrations/0017_migrate_fulltext_query_field_prefixes.py create mode 100644 src/documents/search/__init__.py create mode 100644 src/documents/search/_backend.py create mode 100644 src/documents/search/_query.py create mode 100644 src/documents/search/_schema.py create mode 100644 src/documents/search/_tokenizer.py create mode 100644 src/documents/tests/search/__init__.py create mode 100644 src/documents/tests/search/conftest.py create mode 100644 src/documents/tests/search/test_backend.py create mode 100644 src/documents/tests/search/test_migration_fulltext_query_field_prefixes.py create mode 100644 src/documents/tests/search/test_query.py create mode 100644 src/documents/tests/search/test_schema.py create mode 100644 src/documents/tests/search/test_tokenizer.py delete mode 100644 src/documents/tests/test_delayedquery.py delete mode 100644 src/documents/tests/test_index.py diff --git a/docker/rootfs/etc/s6-overlay/s6-rc.d/init-search-index/run b/docker/rootfs/etc/s6-overlay/s6-rc.d/init-search-index/run index 2208faf67..8f6feeb7f 100755 --- a/docker/rootfs/etc/s6-overlay/s6-rc.d/init-search-index/run +++ b/docker/rootfs/etc/s6-overlay/s6-rc.d/init-search-index/run @@ -3,26 +3,10 @@ declare -r 
log_prefix="[init-index]" -declare -r index_version=9 -declare -r data_dir="${PAPERLESS_DATA_DIR:-/usr/src/paperless/data}" -declare -r index_version_file="${data_dir}/.index_version" - -update_index () { - echo "${log_prefix} Search index out of date. Updating..." - cd "${PAPERLESS_SRC_DIR}" - if [[ -n "${USER_IS_NON_ROOT}" ]]; then - python3 manage.py document_index reindex --no-progress-bar - echo ${index_version} | tee "${index_version_file}" > /dev/null - else - s6-setuidgid paperless python3 manage.py document_index reindex --no-progress-bar - echo ${index_version} | s6-setuidgid paperless tee "${index_version_file}" > /dev/null - fi -} - -if [[ (! -f "${index_version_file}") ]]; then - echo "${log_prefix} No index version file found" - update_index -elif [[ $(<"${index_version_file}") != "$index_version" ]]; then - echo "${log_prefix} index version updated" - update_index +echo "${log_prefix} Checking search index..." +cd "${PAPERLESS_SRC_DIR}" +if [[ -n "${USER_IS_NON_ROOT}" ]]; then + python3 manage.py document_index reindex --if-needed --no-progress-bar +else + s6-setuidgid paperless python3 manage.py document_index reindex --if-needed --no-progress-bar fi diff --git a/docs/administration.md b/docs/administration.md index e55b899f5..013ac9fdd 100644 --- a/docs/administration.md +++ b/docs/administration.md @@ -180,6 +180,16 @@ following: This might not actually do anything. Not every new paperless version comes with new database migrations. +4. Rebuild the search index if needed. + + ```shell-session + cd src + python3 manage.py document_index reindex --if-needed + ``` + + This is a no-op if the index is already up to date, so it is safe to + run on every upgrade. + ### Database Upgrades Paperless-ngx is compatible with Django-supported versions of PostgreSQL and MariaDB and it is generally @@ -453,17 +463,42 @@ the search yields non-existing documents or won't find anything, you may need to recreate the index manually. 
``` -document_index {reindex,optimize} +document_index {reindex,optimize} [--recreate] [--if-needed] ``` -Specify `reindex` to have the index created from scratch. This may take -some time. +Specify `reindex` to rebuild the index from all documents in the database. This +may take some time. -Specify `optimize` to optimize the index. This updates certain aspects -of the index and usually makes queries faster and also ensures that the -autocompletion works properly. This command is regularly invoked by the +Pass `--recreate` to wipe the existing index before rebuilding. Use this when the +index is corrupted or you want a fully clean rebuild. + +Pass `--if-needed` to skip the rebuild if the index is already up to date (schema +version and search language match). Safe to run on every startup or upgrade. + +Specify `optimize` to optimize the index. This command is regularly invoked by the task scheduler. +!!! note + + The `optimize` subcommand is deprecated and is now a no-op. Tantivy manages + segment merging automatically; no manual optimization step is needed. + +!!! note + + **Docker users:** On every startup, the container runs + `document_index reindex --if-needed` automatically. Schema changes, language + changes, and missing indexes are all detected and rebuilt before the webserver + starts. No manual step is required. + + **Bare metal users:** Run the following command after each upgrade (and after + changing `PAPERLESS_SEARCH_LANGUAGE`). It is a no-op if the index is already + up to date: + + ```shell-session + cd src + python3 manage.py document_index reindex --if-needed + ``` + ### Clearing the database read cache If the database read cache is enabled, **you must run this command** after making any changes to the database outside the application context. diff --git a/docs/api.md b/docs/api.md index 21c6b140f..2284d9d29 100644 --- a/docs/api.md +++ b/docs/api.md @@ -167,9 +167,8 @@ Query parameters: - `term`: The incomplete term. - `limit`: Amount of results. 
Defaults to 10. -Results returned by the endpoint are ordered by importance of the term -in the document index. The first result is the term that has the highest -[Tf/Idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) score in the index. +Results are ordered by how many of the user's visible documents contain +each matching word. The first result is the word that appears in the most documents. ```json ["term1", "term3", "term6", "term4"] diff --git a/docs/configuration.md b/docs/configuration.md index 4ce2d9dc6..a22171ce9 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1103,6 +1103,32 @@ should be a valid crontab(5) expression describing when to run. Defaults to `0 0 * * *` or daily at midnight. +#### [`PAPERLESS_SEARCH_LANGUAGE=`](#PAPERLESS_SEARCH_LANGUAGE) {#PAPERLESS_SEARCH_LANGUAGE} + +: Sets the stemmer language for the full-text search index. +Stemming improves recall by matching word variants (e.g. "running" matches "run"). +Changing this setting causes the index to be rebuilt automatically on next startup. +An invalid value raises an error at startup. + +: Use the ISO 639-1 two-letter code (e.g. `en`, `de`, `fr`). Lowercase full names +(e.g. `english`, `german`, `french`) are also accepted. The capitalized names shown +in the [Tantivy Language enum](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html) +documentation are **not** valid — use the lowercase equivalent. + +: If not set, paperless infers the language from +[`PAPERLESS_OCR_LANGUAGE`](#PAPERLESS_OCR_LANGUAGE). If the OCR language has no +Tantivy stemmer equivalent, stemming is disabled. + + Defaults to unset (inferred from `PAPERLESS_OCR_LANGUAGE`). + +#### [`PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD=`](#PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD) {#PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD} + +: When set to a float value, approximate/fuzzy matching is applied alongside exact +matching. Fuzzy results rank below exact matches. 
A value of `0.5` is a reasonable +starting point. Leave unset to disable fuzzy matching entirely. + + Defaults to unset (disabled). + #### [`PAPERLESS_SANITY_TASK_CRON=`](#PAPERLESS_SANITY_TASK_CRON) {#PAPERLESS_SANITY_TASK_CRON} : Configures the scheduled sanity checker frequency. The value should be a diff --git a/docs/migration-v3.md b/docs/migration-v3.md index 4c728a6a4..1cfb212ff 100644 --- a/docs/migration-v3.md +++ b/docs/migration-v3.md @@ -104,6 +104,37 @@ Multiple options are combined in a single value: PAPERLESS_DB_OPTIONS="sslmode=require;sslrootcert=/certs/ca.pem;pool.max_size=10" ``` +## Search Index (Whoosh -> Tantivy) + +The full-text search backend has been replaced with [Tantivy](https://github.com/quickwit-oss/tantivy). +The index format is incompatible with Whoosh, so **the search index is automatically rebuilt from +scratch on first startup after upgrading**. No manual action is required for the rebuild itself. + +### Note and custom field search syntax + +The old Whoosh index exposed `note` and `custom_field` as flat text fields that were included in +unqualified searches (e.g. just typing `invoice` would match note content). With Tantivy these are +now structured JSON fields accessed via dotted paths: + +| Old syntax | New syntax | +| -------------------- | --------------------------- | +| `note:query` | `notes.note:query` | +| `custom_field:query` | `custom_fields.value:query` | + +**Saved views are migrated automatically.** Any saved view filter rule that used an explicit +`note:` or `custom_field:` field prefix in a fulltext query is rewritten to the new syntax by a +data migration that runs on upgrade. + +**Unqualified queries are not migrated.** If you had a saved view with a plain search term (e.g. +`invoice`) that happened to match note content or custom field values, it will no longer return +those matches. 
Update those queries to use the explicit prefix, for example: + +``` +invoice OR notes.note:invoice OR custom_fields.value:invoice +``` + +Custom field names can also be searched with `custom_fields.name:fieldname`. + ## OpenID Connect Token Endpoint Authentication Some existing OpenID Connect setups may require an explicit token endpoint authentication method after upgrading to v3. diff --git a/docs/usage.md b/docs/usage.md index 6da6c4d77..4e2def93b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -804,13 +804,20 @@ contract you signed 8 years ago). When you search paperless for a document, it tries to match this query against your documents. Paperless will look for matching documents by -inspecting their content, title, correspondent, type and tags. Paperless -returns a scored list of results, so that documents matching your query -better will appear further up in the search results. +inspecting their content, title, correspondent, type, tags, notes, and +custom field values. Paperless returns a scored list of results, so that +documents matching your query better will appear further up in the search +results. By default, paperless returns only documents which contain all words -typed in the search bar. However, paperless also offers advanced search -syntax if you want to drill down the results further. +typed in the search bar. A few things to know about how matching works: + +- **Word-order-independent**: "invoice unpaid" and "unpaid invoice" return the same results. +- **Accent-insensitive**: searching `resume` also finds `résumé`, `cafe` finds `café`. +- **Separator-agnostic**: punctuation and separators are stripped during indexing, so + searching a partial number like `1312` finds documents containing `A-1312/B`. + +Paperless also offers advanced search syntax if you want to drill down further. 
Matching documents with logical expressions: @@ -839,18 +846,69 @@ Matching inexact words: produ*name ``` +Matching natural date keywords: + +``` +added:today +modified:yesterday +created:this_week +added:last_month +modified:this_year +``` + +Supported date keywords: `today`, `yesterday`, `this_week`, `last_week`, +`this_month`, `last_month`, `this_year`, `last_year`. + +#### Searching custom fields + +Custom field values are included in the full-text index, so a plain search +already matches documents whose custom field values contain your search terms. +To narrow by field name or value specifically: + +``` +custom_fields.value:policy +custom_fields.name:"Contract Number" +custom_fields.name:Insurance custom_fields.value:policy +``` + +- `custom_fields.value` matches against the value of any custom field. +- `custom_fields.name` matches the name of the field (use quotes for multi-word names). +- Combine both to find documents where a specific named field contains a specific value. + +Because separators are stripped during indexing, individual parts of formatted +codes are searchable on their own. A value stored as `A-1312/99.50` produces the +tokens `a`, `1312`, `99`, `50` — each searchable independently: + +``` +custom_fields.value:1312 +custom_fields.name:"Contract Number" custom_fields.value:1312 +``` + !!! note - Inexact terms are hard for search indexes. These queries might take a - while to execute. That's why paperless offers auto complete and query - correction. + Custom date fields do not support relative date syntax (e.g. `[now to 2 weeks]`). + For date ranges on custom date fields, use the document list filters in the web UI. + +#### Searching notes + +Notes content is included in full-text search automatically. To search +by note author or content specifically: + +``` +notes.user:alice +notes.note:reminder +notes.user:alice notes.note:insurance +``` All of these constructs can be combined as you see fit. 
If you want to -learn more about the query language used by paperless, paperless uses -Whoosh's default query language. Head over to [Whoosh query -language](https://whoosh.readthedocs.io/en/latest/querylang.html). For -details on what date parsing utilities are available, see [Date -parsing](https://whoosh.readthedocs.io/en/latest/dates.html#parsing-date-queries). +learn more about the query language used by paperless, see the +[Tantivy query language documentation](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html). + +!!! note + + Fuzzy (approximate) matching can be enabled by setting + [`PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD`](configuration.md#PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD). + When enabled, paperless will include near-miss results ranked below exact matches. ## Keyboard shortcuts / hotkeys diff --git a/pyproject.toml b/pyproject.toml index e37a7958f..5af886f0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,11 +74,11 @@ dependencies = [ "scikit-learn~=1.8.0", "sentence-transformers>=4.1", "setproctitle~=1.3.4", + "tantivy>=0.25.1", "tika-client~=0.10.0", "torch~=2.10.0", "watchfiles>=1.1.1", "whitenoise~=6.11", - "whoosh-reloaded>=2.7.5", "zxing-cpp~=3.0.0", ] [project.optional-dependencies] @@ -123,6 +123,7 @@ testing = [ "pytest-rerunfailures~=16.1", "pytest-sugar", "pytest-xdist~=3.8.0", + "time-machine>=2.13", ] typing = [ "celery-types", @@ -310,6 +311,7 @@ markers = [ "greenmail: Tests requiring Greenmail service", "date_parsing: Tests which cover date parsing from content or filename", "management: Tests which cover management commands/functionality", + "search: Tests for the Tantivy search backend", ] [tool.pytest_env] diff --git a/src/documents/admin.py b/src/documents/admin.py index 6c7a6f304..f0e5ccd25 100644 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -100,24 +100,23 @@ class DocumentAdmin(GuardedModelAdmin): return Document.global_objects.all() def delete_queryset(self, request, queryset): 
- from documents import index + from documents.search import get_backend - with index.open_index_writer() as writer: + with get_backend().batch_update() as batch: for o in queryset: - index.remove_document(writer, o) - + batch.remove(o.pk) super().delete_queryset(request, queryset) def delete_model(self, request, obj): - from documents import index + from documents.search import get_backend - index.remove_document_from_index(obj) + get_backend().remove(obj.pk) super().delete_model(request, obj) def save_model(self, request, obj, form, change): - from documents import index + from documents.search import get_backend - index.add_or_update_document(obj) + get_backend().add_or_update(obj) super().save_model(request, obj, form, change) diff --git a/src/documents/bulk_edit.py b/src/documents/bulk_edit.py index 8dbcdb8a4..3f80b699d 100644 --- a/src/documents/bulk_edit.py +++ b/src/documents/bulk_edit.py @@ -349,11 +349,11 @@ def delete(doc_ids: list[int]) -> Literal["OK"]: Document.objects.filter(id__in=delete_ids).delete() - from documents import index + from documents.search import get_backend - with index.open_index_writer() as writer: + with get_backend().batch_update() as batch: for id in delete_ids: - index.remove_document_by_id(writer, id) + batch.remove(id) status_mgr = DocumentsStatusManager() status_mgr.send_documents_deleted(delete_ids) diff --git a/src/documents/index.py b/src/documents/index.py deleted file mode 100644 index 24d74d8c1..000000000 --- a/src/documents/index.py +++ /dev/null @@ -1,675 +0,0 @@ -from __future__ import annotations - -import logging -import math -import re -from collections import Counter -from contextlib import contextmanager -from datetime import UTC -from datetime import datetime -from datetime import time -from datetime import timedelta -from shutil import rmtree -from time import sleep -from typing import TYPE_CHECKING -from typing import Literal - -from dateutil.relativedelta import relativedelta -from django.conf import 
settings -from django.utils import timezone as django_timezone -from django.utils.timezone import get_current_timezone -from django.utils.timezone import now -from guardian.shortcuts import get_users_with_perms -from whoosh import classify -from whoosh import highlight -from whoosh import query -from whoosh.fields import BOOLEAN -from whoosh.fields import DATETIME -from whoosh.fields import KEYWORD -from whoosh.fields import NUMERIC -from whoosh.fields import TEXT -from whoosh.fields import Schema -from whoosh.highlight import HtmlFormatter -from whoosh.idsets import BitSet -from whoosh.idsets import DocIdSet -from whoosh.index import FileIndex -from whoosh.index import LockError -from whoosh.index import create_in -from whoosh.index import exists_in -from whoosh.index import open_dir -from whoosh.qparser import MultifieldParser -from whoosh.qparser import QueryParser -from whoosh.qparser.dateparse import DateParserPlugin -from whoosh.qparser.dateparse import English -from whoosh.qparser.plugins import FieldsPlugin -from whoosh.scoring import TF_IDF -from whoosh.util.times import timespan -from whoosh.writing import AsyncWriter - -from documents.models import CustomFieldInstance -from documents.models import Document -from documents.models import Note -from documents.models import User - -if TYPE_CHECKING: - from django.db.models import QuerySet - from whoosh.reading import IndexReader - from whoosh.searching import ResultsPage - from whoosh.searching import Searcher - -logger = logging.getLogger("paperless.index") - - -def get_schema() -> Schema: - return Schema( - id=NUMERIC(stored=True, unique=True), - title=TEXT(sortable=True), - content=TEXT(), - asn=NUMERIC(sortable=True, signed=False), - correspondent=TEXT(sortable=True), - correspondent_id=NUMERIC(), - has_correspondent=BOOLEAN(), - tag=KEYWORD(commas=True, scorable=True, lowercase=True), - tag_id=KEYWORD(commas=True, scorable=True), - has_tag=BOOLEAN(), - type=TEXT(sortable=True), - type_id=NUMERIC(), - 
has_type=BOOLEAN(), - created=DATETIME(sortable=True), - modified=DATETIME(sortable=True), - added=DATETIME(sortable=True), - path=TEXT(sortable=True), - path_id=NUMERIC(), - has_path=BOOLEAN(), - notes=TEXT(), - num_notes=NUMERIC(sortable=True, signed=False), - custom_fields=TEXT(), - custom_field_count=NUMERIC(sortable=True, signed=False), - has_custom_fields=BOOLEAN(), - custom_fields_id=KEYWORD(commas=True), - owner=TEXT(), - owner_id=NUMERIC(), - has_owner=BOOLEAN(), - viewer_id=KEYWORD(commas=True), - checksum=TEXT(), - page_count=NUMERIC(sortable=True), - original_filename=TEXT(sortable=True), - is_shared=BOOLEAN(), - ) - - -def open_index(*, recreate=False) -> FileIndex: - transient_exceptions = (FileNotFoundError, LockError) - max_retries = 3 - retry_delay = 0.1 - - for attempt in range(max_retries + 1): - try: - if exists_in(settings.INDEX_DIR) and not recreate: - return open_dir(settings.INDEX_DIR, schema=get_schema()) - break - except transient_exceptions as exc: - is_last_attempt = attempt == max_retries or recreate - if is_last_attempt: - logger.exception( - "Error while opening the index after retries, recreating.", - ) - break - - logger.warning( - "Transient error while opening the index (attempt %s/%s): %s. 
Retrying.", - attempt + 1, - max_retries + 1, - exc, - ) - sleep(retry_delay) - except Exception: - logger.exception("Error while opening the index, recreating.") - break - - # create_in doesn't handle corrupted indexes very well, remove the directory entirely first - if settings.INDEX_DIR.is_dir(): - rmtree(settings.INDEX_DIR) - settings.INDEX_DIR.mkdir(parents=True, exist_ok=True) - - return create_in(settings.INDEX_DIR, get_schema()) - - -@contextmanager -def open_index_writer(*, optimize=False) -> AsyncWriter: - writer = AsyncWriter(open_index()) - - try: - yield writer - except Exception as e: - logger.exception(str(e)) - writer.cancel() - finally: - writer.commit(optimize=optimize) - - -@contextmanager -def open_index_searcher() -> Searcher: - searcher = open_index().searcher() - - try: - yield searcher - finally: - searcher.close() - - -def update_document( - writer: AsyncWriter, - doc: Document, - effective_content: str | None = None, -) -> None: - tags = ",".join([t.name for t in doc.tags.all()]) - tags_ids = ",".join([str(t.id) for t in doc.tags.all()]) - notes = ",".join([str(c.note) for c in Note.objects.filter(document=doc)]) - custom_fields = ",".join( - [str(c) for c in CustomFieldInstance.objects.filter(document=doc)], - ) - custom_fields_ids = ",".join( - [str(f.field.id) for f in CustomFieldInstance.objects.filter(document=doc)], - ) - asn: int | None = doc.archive_serial_number - if asn is not None and ( - asn < Document.ARCHIVE_SERIAL_NUMBER_MIN - or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX - ): - logger.error( - f"Not indexing Archive Serial Number {asn} of document {doc.pk}. 
" - f"ASN is out of range " - f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, " - f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}.", - ) - asn = 0 - users_with_perms = get_users_with_perms( - doc, - only_with_perms_in=["view_document"], - ) - viewer_ids: str = ",".join([str(u.id) for u in users_with_perms]) - writer.update_document( - id=doc.pk, - title=doc.title, - content=effective_content or doc.content, - correspondent=doc.correspondent.name if doc.correspondent else None, - correspondent_id=doc.correspondent.id if doc.correspondent else None, - has_correspondent=doc.correspondent is not None, - tag=tags if tags else None, - tag_id=tags_ids if tags_ids else None, - has_tag=len(tags) > 0, - type=doc.document_type.name if doc.document_type else None, - type_id=doc.document_type.id if doc.document_type else None, - has_type=doc.document_type is not None, - created=datetime.combine(doc.created, time.min), - added=doc.added, - asn=asn, - modified=doc.modified, - path=doc.storage_path.name if doc.storage_path else None, - path_id=doc.storage_path.id if doc.storage_path else None, - has_path=doc.storage_path is not None, - notes=notes, - num_notes=len(notes), - custom_fields=custom_fields, - custom_field_count=len(doc.custom_fields.all()), - has_custom_fields=len(custom_fields) > 0, - custom_fields_id=custom_fields_ids if custom_fields_ids else None, - owner=doc.owner.username if doc.owner else None, - owner_id=doc.owner.id if doc.owner else None, - has_owner=doc.owner is not None, - viewer_id=viewer_ids if viewer_ids else None, - checksum=doc.checksum, - page_count=doc.page_count, - original_filename=doc.original_filename, - is_shared=len(viewer_ids) > 0, - ) - logger.debug(f"Index updated for document {doc.pk}.") - - -def remove_document(writer: AsyncWriter, doc: Document) -> None: - remove_document_by_id(writer, doc.pk) - - -def remove_document_by_id(writer: AsyncWriter, doc_id) -> None: - writer.delete_by_term("id", doc_id) - - -def add_or_update_document( - document: 
Document, - effective_content: str | None = None, -) -> None: - with open_index_writer() as writer: - update_document(writer, document, effective_content=effective_content) - - -def remove_document_from_index(document: Document) -> None: - with open_index_writer() as writer: - remove_document(writer, document) - - -class MappedDocIdSet(DocIdSet): - """ - A DocIdSet backed by a set of `Document` IDs. - Supports efficiently looking up if a whoosh docnum is in the provided `filter_queryset`. - """ - - def __init__(self, filter_queryset: QuerySet, ixreader: IndexReader) -> None: - super().__init__() - document_ids = filter_queryset.order_by("id").values_list("id", flat=True) - max_id = document_ids.last() or 0 - self.document_ids = BitSet(document_ids, size=max_id) - self.ixreader = ixreader - - def __contains__(self, docnum) -> bool: - document_id = self.ixreader.stored_fields(docnum)["id"] - return document_id in self.document_ids - - def __bool__(self) -> Literal[True]: - # searcher.search ignores a filter if it's "falsy". - # We use this hack so this DocIdSet, when used as a filter, is never ignored. 
- return True - - -class DelayedQuery: - def _get_query(self): - raise NotImplementedError # pragma: no cover - - def _get_query_sortedby(self) -> tuple[None, Literal[False]] | tuple[str, bool]: - if "ordering" not in self.query_params: - return None, False - - field: str = self.query_params["ordering"] - - sort_fields_map: dict[str, str] = { - "created": "created", - "modified": "modified", - "added": "added", - "title": "title", - "correspondent__name": "correspondent", - "document_type__name": "type", - "archive_serial_number": "asn", - "num_notes": "num_notes", - "owner": "owner", - "page_count": "page_count", - } - - if field.startswith("-"): - field = field[1:] - reverse = True - else: - reverse = False - - if field not in sort_fields_map: - return None, False - else: - return sort_fields_map[field], reverse - - def __init__( - self, - searcher: Searcher, - query_params, - page_size, - filter_queryset: QuerySet, - ) -> None: - self.searcher = searcher - self.query_params = query_params - self.page_size = page_size - self.saved_results = dict() - self.first_score = None - self.filter_queryset = filter_queryset - self.suggested_correction = None - self._manual_hits_cache: list | None = None - - def __len__(self) -> int: - if self._manual_sort_requested(): - manual_hits = self._manual_hits() - return len(manual_hits) - - page = self[0:1] - return len(page) - - def _manual_sort_requested(self): - ordering = self.query_params.get("ordering", "") - return ordering.lstrip("-").startswith("custom_field_") - - def _manual_hits(self): - if self._manual_hits_cache is None: - q, mask, suggested_correction = self._get_query() - self.suggested_correction = suggested_correction - - results = self.searcher.search( - q, - mask=mask, - filter=MappedDocIdSet(self.filter_queryset, self.searcher.ixreader), - limit=None, - ) - results.fragmenter = highlight.ContextFragmenter(surround=50) - results.formatter = HtmlFormatter(tagname="span", between=" ... 
") - - if not self.first_score and len(results) > 0: - self.first_score = results[0].score - - if self.first_score: - results.top_n = [ - ( - (hit[0] / self.first_score) if self.first_score else None, - hit[1], - ) - for hit in results.top_n - ] - - hits_by_id = {hit["id"]: hit for hit in results} - matching_ids = list(hits_by_id.keys()) - - ordered_ids = list( - self.filter_queryset.filter(id__in=matching_ids).values_list( - "id", - flat=True, - ), - ) - ordered_ids = list(dict.fromkeys(ordered_ids)) - - self._manual_hits_cache = [ - hits_by_id[_id] for _id in ordered_ids if _id in hits_by_id - ] - return self._manual_hits_cache - - def get_result_ids(self) -> list[int]: - """ - Return all matching document IDs for the current query and ordering. - """ - if self._manual_sort_requested(): - return [hit["id"] for hit in self._manual_hits()] - - q, mask, suggested_correction = self._get_query() - self.suggested_correction = suggested_correction - sortedby, reverse = self._get_query_sortedby() - results = self.searcher.search( - q, - mask=mask, - filter=MappedDocIdSet(self.filter_queryset, self.searcher.ixreader), - limit=None, - sortedby=sortedby, - reverse=reverse, - ) - return [hit["id"] for hit in results] - - def __getitem__(self, item): - if item.start in self.saved_results: - return self.saved_results[item.start] - - if self._manual_sort_requested(): - manual_hits = self._manual_hits() - start = 0 if item.start is None else item.start - stop = item.stop - hits = manual_hits[start:stop] if stop is not None else manual_hits[start:] - page = ManualResultsPage(hits) - self.saved_results[start] = page - return page - - q, mask, suggested_correction = self._get_query() - self.suggested_correction = suggested_correction - sortedby, reverse = self._get_query_sortedby() - - page: ResultsPage = self.searcher.search_page( - q, - mask=mask, - filter=MappedDocIdSet(self.filter_queryset, self.searcher.ixreader), - pagenum=math.floor(item.start / self.page_size) + 1, - 
pagelen=self.page_size, - sortedby=sortedby, - reverse=reverse, - ) - page.results.fragmenter = highlight.ContextFragmenter(surround=50) - page.results.formatter = HtmlFormatter(tagname="span", between=" ... ") - - if not self.first_score and len(page.results) > 0 and sortedby is None: - self.first_score = page.results[0].score - - page.results.top_n = [ - ( - (hit[0] / self.first_score) if self.first_score else None, - hit[1], - ) - for hit in page.results.top_n - ] - - self.saved_results[item.start] = page - - return page - - -class ManualResultsPage(list): - def __init__(self, hits) -> None: - super().__init__(hits) - self.results = ManualResults(hits) - - -class ManualResults: - def __init__(self, hits) -> None: - self._docnums = [hit.docnum for hit in hits] - - def docs(self): - return self._docnums - - -class LocalDateParser(English): - def reverse_timezone_offset(self, d): - return (d.replace(tzinfo=django_timezone.get_current_timezone())).astimezone( - UTC, - ) - - def date_from(self, *args, **kwargs): - d = super().date_from(*args, **kwargs) - if isinstance(d, timespan): - d.start = self.reverse_timezone_offset(d.start) - d.end = self.reverse_timezone_offset(d.end) - elif isinstance(d, datetime): - d = self.reverse_timezone_offset(d) - return d - - -class DelayedFullTextQuery(DelayedQuery): - def _get_query(self) -> tuple: - q_str = self.query_params["query"] - q_str = rewrite_natural_date_keywords(q_str) - qp = MultifieldParser( - [ - "content", - "title", - "correspondent", - "tag", - "type", - "notes", - "custom_fields", - ], - self.searcher.ixreader.schema, - ) - qp.add_plugin( - DateParserPlugin( - basedate=django_timezone.now(), - dateparser=LocalDateParser(), - ), - ) - q = qp.parse(q_str) - suggested_correction = None - try: - corrected = self.searcher.correct_query(q, q_str) - if corrected.string != q_str: - corrected_results = self.searcher.search( - corrected.query, - limit=1, - filter=MappedDocIdSet(self.filter_queryset, 
self.searcher.ixreader), - scored=False, - ) - if len(corrected_results) > 0: - suggested_correction = corrected.string - except Exception as e: - logger.info( - "Error while correcting query %s: %s", - f"{q_str!r}", - e, - ) - - return q, None, suggested_correction - - -class DelayedMoreLikeThisQuery(DelayedQuery): - def _get_query(self) -> tuple: - more_like_doc_id = int(self.query_params["more_like_id"]) - content = Document.objects.get(id=more_like_doc_id).content - - docnum = self.searcher.document_number(id=more_like_doc_id) - kts = self.searcher.key_terms_from_text( - "content", - content, - numterms=20, - model=classify.Bo1Model, - normalize=False, - ) - q = query.Or( - [query.Term("content", word, boost=weight) for word, weight in kts], - ) - mask: set = {docnum} - - return q, mask, None - - -def autocomplete( - ix: FileIndex, - term: str, - limit: int = 10, - user: User | None = None, -) -> list: - """ - Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions - and without scoring - """ - terms = [] - - with ix.searcher(weighting=TF_IDF()) as s: - qp = QueryParser("content", schema=ix.schema) - # Don't let searches with a query that happen to match a field override the - # content field query instead and return bogus, not text data - qp.remove_plugin_class(FieldsPlugin) - q = qp.parse(f"{term.lower()}*") - user_criterias: list = get_permissions_criterias(user) - - results = s.search( - q, - terms=True, - filter=query.Or(user_criterias) if user_criterias is not None else None, - ) - - termCounts = Counter() - if results.has_matched_terms(): - for hit in results: - for _, match in hit.matched_terms(): - termCounts[match] += 1 - terms = [t for t, _ in termCounts.most_common(limit)] - - term_encoded: bytes = term.encode("UTF-8") - if term_encoded in terms: - terms.insert(0, terms.pop(terms.index(term_encoded))) - - return terms - - -def get_permissions_criterias(user: User | None = None) -> list: - user_criterias = [query.Term("has_owner", 
text=False)] - if user is not None: - if user.is_superuser: # superusers see all docs - user_criterias = [] - else: - user_criterias.append(query.Term("owner_id", user.id)) - user_criterias.append( - query.Term("viewer_id", str(user.id)), - ) - return user_criterias - - -def rewrite_natural_date_keywords(query_string: str) -> str: - """ - Rewrites natural date keywords (e.g. added:today or added:"yesterday") to UTC range syntax for Whoosh. - This resolves timezone issues with date parsing in Whoosh as well as adding support for more - natural date keywords. - """ - - tz = get_current_timezone() - local_now = now().astimezone(tz) - today = local_now.date() - - # all supported Keywords - pattern = r"(\b(?:added|created|modified))\s*:\s*[\"']?(today|yesterday|this month|previous month|previous week|previous quarter|this year|previous year)[\"']?" - - def repl(m): - field = m.group(1) - keyword = m.group(2).lower() - - match keyword: - case "today": - start = datetime.combine(today, time.min, tzinfo=tz) - end = datetime.combine(today, time.max, tzinfo=tz) - - case "yesterday": - yesterday = today - timedelta(days=1) - start = datetime.combine(yesterday, time.min, tzinfo=tz) - end = datetime.combine(yesterday, time.max, tzinfo=tz) - - case "this month": - start = datetime(local_now.year, local_now.month, 1, 0, 0, 0, tzinfo=tz) - end = start + relativedelta(months=1) - timedelta(seconds=1) - - case "previous month": - this_month_start = datetime( - local_now.year, - local_now.month, - 1, - 0, - 0, - 0, - tzinfo=tz, - ) - start = this_month_start - relativedelta(months=1) - end = this_month_start - timedelta(seconds=1) - - case "this year": - start = datetime(local_now.year, 1, 1, 0, 0, 0, tzinfo=tz) - end = datetime(local_now.year, 12, 31, 23, 59, 59, tzinfo=tz) - - case "previous week": - days_since_monday = local_now.weekday() - this_week_start = datetime.combine( - today - timedelta(days=days_since_monday), - time.min, - tzinfo=tz, - ) - start = this_week_start - 
timedelta(days=7) - end = this_week_start - timedelta(seconds=1) - - case "previous quarter": - current_quarter = (local_now.month - 1) // 3 + 1 - this_quarter_start_month = (current_quarter - 1) * 3 + 1 - this_quarter_start = datetime( - local_now.year, - this_quarter_start_month, - 1, - 0, - 0, - 0, - tzinfo=tz, - ) - start = this_quarter_start - relativedelta(months=3) - end = this_quarter_start - timedelta(seconds=1) - - case "previous year": - start = datetime(local_now.year - 1, 1, 1, 0, 0, 0, tzinfo=tz) - end = datetime(local_now.year - 1, 12, 31, 23, 59, 59, tzinfo=tz) - - # Convert to UTC and format - start_str = start.astimezone(UTC).strftime("%Y%m%d%H%M%S") - end_str = end.astimezone(UTC).strftime("%Y%m%d%H%M%S") - return f"{field}:[{start_str} TO {end_str}]" - - return re.sub(pattern, repl, query_string, flags=re.IGNORECASE) diff --git a/src/documents/management/commands/document_index.py b/src/documents/management/commands/document_index.py index 742922010..c4f72dd3a 100644 --- a/src/documents/management/commands/document_index.py +++ b/src/documents/management/commands/document_index.py @@ -1,11 +1,26 @@ +import logging + +from django.conf import settings from django.db import transaction from documents.management.commands.base import PaperlessCommand -from documents.tasks import index_optimize -from documents.tasks import index_reindex +from documents.models import Document +from documents.search import get_backend +from documents.search import needs_rebuild +from documents.search import reset_backend +from documents.search import wipe_index + +logger = logging.getLogger("paperless.management.document_index") class Command(PaperlessCommand): + """ + Django management command for search index operations. + + Provides subcommands for reindexing documents and optimizing the search index. + Supports conditional reindexing based on schema version and language changes. + """ + help = "Manages the document index." 
supports_progress_bar = True @@ -14,15 +29,49 @@ class Command(PaperlessCommand): def add_arguments(self, parser): super().add_arguments(parser) parser.add_argument("command", choices=["reindex", "optimize"]) + parser.add_argument( + "--recreate", + action="store_true", + default=False, + help="Wipe and recreate the index from scratch (only used with reindex).", + ) + parser.add_argument( + "--if-needed", + action="store_true", + default=False, + help=( + "Skip reindex if the index is already up to date. " + "Checks schema version and search language sentinels. " + "Safe to run on every startup or upgrade." + ), + ) def handle(self, *args, **options): with transaction.atomic(): if options["command"] == "reindex": - index_reindex( + if options.get("if_needed") and not needs_rebuild(settings.INDEX_DIR): + self.stdout.write("Search index is up to date.") + return + if options.get("recreate"): + wipe_index(settings.INDEX_DIR) + + documents = Document.objects.select_related( + "correspondent", + "document_type", + "storage_path", + "owner", + ).prefetch_related("tags", "notes", "custom_fields", "versions") + get_backend().rebuild( + documents, iter_wrapper=lambda docs: self.track( docs, description="Indexing documents...", ), ) + reset_backend() + elif options["command"] == "optimize": - index_optimize() + logger.info( + "document_index optimize is a no-op — Tantivy manages " + "segment merging automatically.", + ) diff --git a/src/documents/migrations/0017_migrate_fulltext_query_field_prefixes.py b/src/documents/migrations/0017_migrate_fulltext_query_field_prefixes.py new file mode 100644 index 000000000..040780a60 --- /dev/null +++ b/src/documents/migrations/0017_migrate_fulltext_query_field_prefixes.py @@ -0,0 +1,39 @@ +import re + +from django.db import migrations + +# Matches "note:" when NOT preceded by a word character or dot. +# This avoids false positives like "denote:" or already-migrated "notes.note:". 
+# Handles start-of-string, whitespace, parentheses, +/- operators per Whoosh syntax. +_NOTE_RE = re.compile(r"(? "custom_fields.value:" +_CUSTOM_FIELD_RE = re.compile(r"(? str: - value = ( - next( - option.get("label") - for option in self.field.extra_data["select_options"] - if option.get("id") == self.value_select - ) - if ( - self.field.data_type == CustomField.FieldDataType.SELECT - and self.value_select is not None - ) - else self.value - ) - return str(self.field.name) + f" : {value}" + return str(self.field.name) + f" : {self.value_for_search}" @classmethod def get_value_field_name(cls, data_type: CustomField.FieldDataType): @@ -1144,6 +1132,25 @@ class CustomFieldInstance(SoftDeleteModel): value_field_name = self.get_value_field_name(self.field.data_type) return getattr(self, value_field_name) + @property + def value_for_search(self) -> str | None: + """ + Return the value suitable for full-text indexing and display, or None + if the value is unset. + + For SELECT fields, resolves the human-readable label rather than the + opaque option ID stored in value_select. + """ + if self.value is None: + return None + if self.field.data_type == CustomField.FieldDataType.SELECT: + options = (self.field.extra_data or {}).get("select_options", []) + return next( + (o["label"] for o in options if o.get("id") == self.value), + None, + ) + return str(self.value) + if settings.AUDIT_LOG_ENABLED: auditlog.register( diff --git a/src/documents/sanity_checker.py b/src/documents/sanity_checker.py index b53ed8cfb..0b3dea368 100644 --- a/src/documents/sanity_checker.py +++ b/src/documents/sanity_checker.py @@ -9,19 +9,14 @@ to wrap the document queryset (e.g., with a progress bar). The default is an identity function that adds no overhead. 
""" -from __future__ import annotations - import logging import uuid from collections import defaultdict -from collections.abc import Callable -from collections.abc import Iterable from collections.abc import Iterator from pathlib import Path from typing import TYPE_CHECKING from typing import Final from typing import TypedDict -from typing import TypeVar from celery import states from django.conf import settings @@ -29,14 +24,13 @@ from django.utils import timezone from documents.models import Document from documents.models import PaperlessTask +from documents.utils import IterWrapper from documents.utils import compute_checksum +from documents.utils import identity from paperless.config import GeneralConfig logger = logging.getLogger("paperless.sanity_checker") -_T = TypeVar("_T") -IterWrapper = Callable[[Iterable[_T]], Iterable[_T]] - class MessageEntry(TypedDict): """A single sanity check message with its severity level.""" @@ -45,11 +39,6 @@ class MessageEntry(TypedDict): message: str -def _identity(iterable: Iterable[_T]) -> Iterable[_T]: - """Pass through an iterable unchanged (default iter_wrapper).""" - return iterable - - class SanityCheckMessages: """Collects sanity check messages grouped by document primary key. @@ -296,7 +285,7 @@ def _check_document( def check_sanity( *, scheduled: bool = True, - iter_wrapper: IterWrapper[Document] = _identity, + iter_wrapper: IterWrapper[Document] = identity, ) -> SanityCheckMessages: """Run a full sanity check on the document archive. 
diff --git a/src/documents/search/__init__.py b/src/documents/search/__init__.py new file mode 100644 index 000000000..b0a89f242 --- /dev/null +++ b/src/documents/search/__init__.py @@ -0,0 +1,21 @@ +from documents.search._backend import SearchIndexLockError +from documents.search._backend import SearchResults +from documents.search._backend import TantivyBackend +from documents.search._backend import TantivyRelevanceList +from documents.search._backend import WriteBatch +from documents.search._backend import get_backend +from documents.search._backend import reset_backend +from documents.search._schema import needs_rebuild +from documents.search._schema import wipe_index + +__all__ = [ + "SearchIndexLockError", + "SearchResults", + "TantivyBackend", + "TantivyRelevanceList", + "WriteBatch", + "get_backend", + "needs_rebuild", + "reset_backend", + "wipe_index", +] diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py new file mode 100644 index 000000000..a1bff8a9f --- /dev/null +++ b/src/documents/search/_backend.py @@ -0,0 +1,858 @@ +from __future__ import annotations + +import logging +import threading +import unicodedata +from collections import Counter +from dataclasses import dataclass +from datetime import UTC +from datetime import datetime +from typing import TYPE_CHECKING +from typing import Self +from typing import TypedDict +from typing import TypeVar + +import filelock +import regex +import tantivy +from django.conf import settings +from django.utils.timezone import get_current_timezone +from guardian.shortcuts import get_users_with_perms + +from documents.search._query import build_permission_filter +from documents.search._query import parse_user_query +from documents.search._schema import _write_sentinels +from documents.search._schema import build_schema +from documents.search._schema import open_or_rebuild_index +from documents.search._schema import wipe_index +from documents.search._tokenizer import register_tokenizers 
+from documents.utils import IterWrapper +from documents.utils import identity + +if TYPE_CHECKING: + from pathlib import Path + + from django.contrib.auth.base_user import AbstractBaseUser + from django.db.models import QuerySet + + from documents.models import Document + +logger = logging.getLogger("paperless.search") + +_WORD_RE = regex.compile(r"\w+") +_AUTOCOMPLETE_REGEX_TIMEOUT = 1.0 # seconds; guards against ReDoS on untrusted content + +T = TypeVar("T") + + +def _ascii_fold(s: str) -> str: + """ + Normalize unicode to ASCII equivalent characters for search consistency. + + Converts accented characters (e.g., "café") to their ASCII base forms ("cafe") + to enable cross-language searching without requiring exact diacritic matching. + """ + return unicodedata.normalize("NFD", s).encode("ascii", "ignore").decode() + + +def _extract_autocomplete_words(text_sources: list[str]) -> set[str]: + """Extract and normalize words for autocomplete. + + Splits on non-word characters (matching Tantivy's simple tokenizer), lowercases, + and ascii-folds each token. Uses the regex library with a timeout to guard against + ReDoS on untrusted document content. + """ + words = set() + for text in text_sources: + if not text: + continue + try: + tokens = _WORD_RE.findall(text, timeout=_AUTOCOMPLETE_REGEX_TIMEOUT) + except TimeoutError: # pragma: no cover + logger.warning( + "Autocomplete word extraction timed out for a text source; skipping.", + ) + continue + for token in tokens: + normalized = _ascii_fold(token.lower()) + if normalized: + words.add(normalized) + return words + + +class SearchHit(TypedDict): + """Type definition for search result hits.""" + + id: int + score: float + rank: int + highlights: dict[str, str] + + +@dataclass(frozen=True, slots=True) +class SearchResults: + """ + Container for search results with pagination metadata. 
+ + Attributes: + hits: List of search results with scores and highlights + total: Total matching documents across all pages (for pagination) + query: Preprocessed query string after date/syntax rewriting + """ + + hits: list[SearchHit] + total: int # total matching documents (for pagination) + query: str # preprocessed query string + + +class TantivyRelevanceList: + """ + DRF-compatible list wrapper for Tantivy search hits. + + Provides paginated access to search results while storing all hits in memory + for efficient ID retrieval. Used by Django REST framework for pagination. + + Methods: + __len__: Returns total hit count for pagination calculations + __getitem__: Slices the hit list for page-specific results + + Note: Stores ALL post-filter hits so get_all_result_ids() can return + every matching document ID without requiring a second search query. + """ + + def __init__(self, hits: list[SearchHit]) -> None: + self._hits = hits + + def __len__(self) -> int: + return len(self._hits) + + def __getitem__(self, key: slice) -> list[SearchHit]: + return self._hits[key] + + +class SearchIndexLockError(Exception): + """Raised when the search index file lock cannot be acquired within the timeout.""" + + +class WriteBatch: + """ + Context manager for bulk index operations with file locking. + + Provides transactional batch updates to the search index with proper + concurrency control via file locking. All operations within the batch + are committed atomically or rolled back on exception. 
+ + Usage: + with backend.batch_update() as batch: + batch.add_or_update(document) + batch.remove(doc_id) + """ + + def __init__(self, backend: TantivyBackend, lock_timeout: float): + self._backend = backend + self._lock_timeout = lock_timeout + self._writer = None + self._lock = None + + def __enter__(self) -> Self: + if self._backend._path is not None: + lock_path = self._backend._path / ".tantivy.lock" + self._lock = filelock.FileLock(str(lock_path)) + try: + self._lock.acquire(timeout=self._lock_timeout) + except filelock.Timeout as e: # pragma: no cover + raise SearchIndexLockError( + f"Could not acquire index lock within {self._lock_timeout}s", + ) from e + + self._writer = self._backend._index.writer() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + try: + if exc_type is None: + self._writer.commit() + self._backend._index.reload() + # Explicitly delete writer to release tantivy's internal lock. + # On exception the uncommitted writer is simply discarded. + if self._writer is not None: + del self._writer + self._writer = None + finally: + if self._lock is not None: + self._lock.release() + + def add_or_update( + self, + document: Document, + effective_content: str | None = None, + ) -> None: + """ + Add or update a document in the batch. + + Implements upsert behavior by deleting any existing document with the same ID + and adding the new version. This ensures stale document data (e.g., after + permission changes) doesn't persist in the index. + + Args: + document: Django Document instance to index + effective_content: Override document.content for indexing (used when + re-indexing with newer OCR text from document versions) + """ + self.remove(document.pk) + doc = self._backend._build_tantivy_doc(document, effective_content) + self._writer.add_document(doc) + + def remove(self, doc_id: int) -> None: + """ + Remove a document from the batch by its primary key. 
+ + Uses range query instead of term query to work around unsigned integer + type detection bug in tantivy-py 0.25. + """ + # Use range query to work around u64 deletion bug + self._writer.delete_documents_by_query( + tantivy.Query.range_query( + self._backend._schema, + "id", + tantivy.FieldType.Unsigned, + doc_id, + doc_id, + ), + ) + + +class TantivyBackend: + """ + Tantivy search backend with explicit lifecycle management. + + Provides full-text search capabilities using the Tantivy search engine. + Supports in-memory indexes (for testing) and persistent on-disk indexes + (for production use). Handles document indexing, search queries, autocompletion, + and "more like this" functionality. + + The backend manages its own connection lifecycle and can be reset when + the underlying index directory changes (e.g., during test isolation). + """ + + def __init__(self, path: Path | None = None): + # path=None → in-memory index (for tests) + # path=some_dir → on-disk index (for production) + self._path = path + self._index = None + self._schema = None + + def open(self) -> None: + """ + Open or rebuild the index as needed. + + For disk-based indexes, checks if rebuilding is needed due to schema + version or language changes. Registers custom tokenizers after opening. + Safe to call multiple times - subsequent calls are no-ops. + """ + if self._index is not None: + return # pragma: no cover + if self._path is not None: + self._index = open_or_rebuild_index(self._path) + else: + self._index = tantivy.Index(build_schema()) + register_tokenizers(self._index, settings.SEARCH_LANGUAGE) + self._schema = self._index.schema + + def close(self) -> None: + """ + Close the index and release resources. + + Safe to call multiple times - subsequent calls are no-ops. 
+ """ + self._index = None + self._schema = None + + def _ensure_open(self) -> None: + """Ensure the index is open before operations.""" + if self._index is None: + self.open() # pragma: no cover + + def _build_tantivy_doc( + self, + document: Document, + effective_content: str | None = None, + ) -> tantivy.Document: + """Build a tantivy Document from a Django Document instance. + + ``effective_content`` overrides ``document.content`` for indexing — + used when re-indexing a root document with a newer version's OCR text. + """ + content = ( + effective_content if effective_content is not None else document.content + ) + + doc = tantivy.Document() + + # Basic fields + doc.add_unsigned("id", document.pk) + doc.add_text("checksum", document.checksum) + doc.add_text("title", document.title) + doc.add_text("title_sort", document.title) + doc.add_text("content", content) + doc.add_text("bigram_content", content) + + # Original filename - only add if not None/empty + if document.original_filename: + doc.add_text("original_filename", document.original_filename) + + # Correspondent + if document.correspondent: + doc.add_text("correspondent", document.correspondent.name) + doc.add_text("correspondent_sort", document.correspondent.name) + doc.add_unsigned("correspondent_id", document.correspondent_id) + + # Document type + if document.document_type: + doc.add_text("document_type", document.document_type.name) + doc.add_text("type_sort", document.document_type.name) + doc.add_unsigned("document_type_id", document.document_type_id) + + # Storage path + if document.storage_path: + doc.add_text("storage_path", document.storage_path.name) + doc.add_unsigned("storage_path_id", document.storage_path_id) + + # Tags — collect names for autocomplete in the same pass + tag_names: list[str] = [] + for tag in document.tags.all(): + doc.add_text("tag", tag.name) + doc.add_unsigned("tag_id", tag.pk) + tag_names.append(tag.name) + + # Notes — JSON for structured queries (notes.user:alice, 
notes.note:text), + # companion text field for default full-text search. + num_notes = 0 + for note in document.notes.all(): + num_notes += 1 + doc.add_json("notes", {"note": note.note, "user": note.user.username}) + + # Custom fields — JSON for structured queries (custom_fields.name:x, custom_fields.value:y), + # companion text field for default full-text search. + for cfi in document.custom_fields.all(): + search_value = cfi.value_for_search + # Skip fields where there is no value yet + if search_value is None: + continue + doc.add_json( + "custom_fields", + { + "name": cfi.field.name, + "value": search_value, + }, + ) + + # Dates + created_date = datetime( + document.created.year, + document.created.month, + document.created.day, + tzinfo=UTC, + ) + doc.add_date("created", created_date) + doc.add_date("modified", document.modified) + doc.add_date("added", document.added) + + if document.archive_serial_number is not None: + doc.add_unsigned("asn", document.archive_serial_number) + + if document.page_count is not None: + doc.add_unsigned("page_count", document.page_count) + + doc.add_unsigned("num_notes", num_notes) + + # Owner + if document.owner_id: + doc.add_unsigned("owner_id", document.owner_id) + + # Viewers with permission + users_with_perms = get_users_with_perms( + document, + only_with_perms_in=["view_document"], + ) + for user in users_with_perms: + doc.add_unsigned("viewer_id", user.pk) + + # Autocomplete words + text_sources = [document.title, content] + if document.correspondent: + text_sources.append(document.correspondent.name) + if document.document_type: + text_sources.append(document.document_type.name) + text_sources.extend(tag_names) + + for word in sorted(_extract_autocomplete_words(text_sources)): + doc.add_text("autocomplete_word", word) + + return doc + + def add_or_update( + self, + document: Document, + effective_content: str | None = None, + ) -> None: + """ + Add or update a single document with file locking. 
+ + Convenience method for single-document updates. For bulk operations, + use batch_update() context manager for better performance. + + Args: + document: Django Document instance to index + effective_content: Override document.content for indexing + """ + self._ensure_open() + with self.batch_update(lock_timeout=5.0) as batch: + batch.add_or_update(document, effective_content) + + def remove(self, doc_id: int) -> None: + """ + Remove a single document from the index with file locking. + + Convenience method for single-document removal. For bulk operations, + use batch_update() context manager for better performance. + + Args: + doc_id: Primary key of the document to remove + """ + self._ensure_open() + with self.batch_update(lock_timeout=5.0) as batch: + batch.remove(doc_id) + + def search( + self, + query: str, + user: AbstractBaseUser | None, + page: int, + page_size: int, + sort_field: str | None, + *, + sort_reverse: bool, + ) -> SearchResults: + """ + Execute a search query against the document index. + + Processes the user query through date rewriting, normalization, and + permission filtering before executing against Tantivy. Supports both + relevance-based and field-based sorting. 
+ + Args: + query: User's search query (supports natural date keywords, field filters) + user: User for permission filtering (None for superuser/no filtering) + page: Page number (1-indexed) for pagination + page_size: Number of results per page + sort_field: Field to sort by (None for relevance ranking) + sort_reverse: Whether to reverse the sort order + + Returns: + SearchResults with hits, total count, and processed query + """ + self._ensure_open() + tz = get_current_timezone() + user_query = parse_user_query(self._index, query, tz) + + # Apply permission filter if user is not None (not superuser) + if user is not None: + permission_filter = build_permission_filter(self._schema, user) + final_query = tantivy.Query.boolean_query( + [ + (tantivy.Occur.Must, user_query), + (tantivy.Occur.Must, permission_filter), + ], + ) + else: + final_query = user_query + + searcher = self._index.searcher() + offset = (page - 1) * page_size + + # Map sort fields + sort_field_map = { + "title": "title_sort", + "correspondent__name": "correspondent_sort", + "document_type__name": "type_sort", + "created": "created", + "added": "added", + "modified": "modified", + "archive_serial_number": "asn", + "page_count": "page_count", + "num_notes": "num_notes", + } + + # Perform search + if sort_field and sort_field in sort_field_map: + mapped_field = sort_field_map[sort_field] + results = searcher.search( + final_query, + limit=offset + page_size, + order_by_field=mapped_field, + order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc, + ) + # Field sorting: hits are still (score, DocAddress) tuples; score unused + all_hits = [(hit[1], 0.0) for hit in results.hits] + else: + # Score-based search: hits are (score, DocAddress) tuples + results = searcher.search(final_query, limit=offset + page_size) + all_hits = [(hit[1], hit[0]) for hit in results.hits] + + total = results.count + + # Normalize scores for score-based searches + if not sort_field and all_hits: + max_score = 
max(hit[1] for hit in all_hits) or 1.0 + all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits] + + # Apply threshold filter if configured (score-based search only) + threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD + if threshold is not None and not sort_field: + all_hits = [hit for hit in all_hits if hit[1] >= threshold] + + # Get the page's hits + page_hits = all_hits[offset : offset + page_size] + + # Build result hits with highlights + hits: list[SearchHit] = [] + snippet_generator = None + + for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1): + # Get the actual document from the searcher using the doc address + actual_doc = searcher.doc(doc_address) + doc_dict = actual_doc.to_dict() + doc_id = doc_dict["id"][0] + + highlights: dict[str, str] = {} + + # Generate highlights if score > 0 + if score > 0: + try: + if snippet_generator is None: + snippet_generator = tantivy.SnippetGenerator.create( + searcher, + final_query, + self._schema, + "content", + ) + + content_snippet = snippet_generator.snippet_from_doc(actual_doc) + if content_snippet: + highlights["content"] = str(content_snippet) + + # Try notes highlights + if "notes" in doc_dict: + notes_generator = tantivy.SnippetGenerator.create( + searcher, + final_query, + self._schema, + "notes", + ) + notes_snippet = notes_generator.snippet_from_doc(actual_doc) + if notes_snippet: + highlights["notes"] = str(notes_snippet) + + except Exception: # pragma: no cover + logger.debug("Failed to generate highlights for doc %s", doc_id) + + hits.append( + SearchHit( + id=doc_id, + score=score, + rank=rank, + highlights=highlights, + ), + ) + + return SearchResults( + hits=hits, + total=total, + query=query, + ) + + def autocomplete( + self, + term: str, + limit: int, + user: AbstractBaseUser | None = None, + ) -> list[str]: + """ + Get autocomplete suggestions for search queries. 
    def more_like_this(
        self,
        doc_id: int,
        user: AbstractBaseUser | None,
        page: int,
        page_size: int,
    ) -> SearchResults:
        """
        Find documents similar to the given document using content analysis.

        Uses Tantivy's "more like this" query to find documents with similar
        content patterns. The original document is excluded from results.

        Args:
            doc_id: Primary key of the reference document
            user: User for permission filtering (None for no filtering)
            page: Page number (1-indexed) for pagination
            page_size: Number of results per page

        Returns:
            SearchResults with similar documents (excluding the original)

        NOTE(review): when the reference document itself lands inside the
        requested page it is skipped, so that page carries fewer than
        ``page_size`` hits and its rank numbering keeps a gap — confirm this
        is acceptable to the API consumers.
        """
        self._ensure_open()
        searcher = self._index.searcher()

        # First find the document address. range_query with lo == hi acts as
        # an exact unsigned-term match (see build_permission_filter notes).
        id_query = tantivy.Query.range_query(
            self._schema,
            "id",
            tantivy.FieldType.Unsigned,
            doc_id,
            doc_id,
        )
        results = searcher.search(id_query, limit=1)

        if not results.hits:
            # Document not found in the index — nothing to be similar to.
            return SearchResults(hits=[], total=0, query=f"more_like:{doc_id}")

        # Extract doc_address from (score, doc_address) tuple
        doc_address = results.hits[0][1]

        # Build more like this query
        mlt_query = tantivy.Query.more_like_this_query(
            doc_address,
            min_doc_frequency=1,
            max_doc_frequency=None,
            min_term_frequency=1,
            max_query_terms=12,
            min_word_length=None,
            max_word_length=None,
            boost_factor=None,
        )

        # Apply permission filter so results never include documents the
        # requesting user cannot see.
        if user is not None:
            permission_filter = build_permission_filter(self._schema, user)
            final_query = tantivy.Query.boolean_query(
                [
                    (tantivy.Occur.Must, mlt_query),
                    (tantivy.Occur.Must, permission_filter),
                ],
            )
        else:
            final_query = mlt_query

        # Search — fetch everything up to the end of the requested page.
        offset = (page - 1) * page_size
        results = searcher.search(final_query, limit=offset + page_size)

        total = results.count
        # Convert from (score, doc_address) to (doc_address, score)
        all_hits = [(hit[1], hit[0]) for hit in results.hits]

        # Normalize scores to [0, 1]; `or 1.0` guards a max score of 0.
        if all_hits:
            max_score = max(hit[1] for hit in all_hits) or 1.0
            all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]

        # Get page hits
        page_hits = all_hits[offset : offset + page_size]

        # Build results
        hits: list[SearchHit] = []
        for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
            actual_doc = searcher.doc(doc_address)
            doc_dict = actual_doc.to_dict()
            result_doc_id = doc_dict["id"][0]

            # Skip the original document — MLT naturally matches it best.
            if result_doc_id == doc_id:
                continue

            hits.append(
                SearchHit(
                    id=result_doc_id,
                    score=score,
                    rank=rank,
                    highlights={},  # MLT doesn't generate highlights
                ),
            )

        return SearchResults(
            hits=hits,
            total=max(0, total - 1),  # Subtract 1 for the original document
            query=f"more_like:{doc_id}",
        )

    def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
        """
        Get a batch context manager for bulk index operations.

        Use this for efficient bulk document updates/deletions. All operations
        within the batch are committed atomically at the end of the context.

        Args:
            lock_timeout: Seconds to wait for file lock acquisition

        Returns:
            WriteBatch context manager

        Raises:
            SearchIndexLockError: If lock cannot be acquired within timeout
        """
        self._ensure_open()
        return WriteBatch(self, lock_timeout)

    def rebuild(
        self,
        documents: QuerySet[Document],
        iter_wrapper: IterWrapper[Document] = identity,
    ) -> None:
        """
        Rebuild the entire search index from scratch.

        Wipes the existing index and re-indexes all provided documents.
        On failure, restores the previous index state to keep the backend usable.

        Args:
            documents: QuerySet of Document instances to index
            iter_wrapper: Optional wrapper function for progress tracking
                (e.g., progress bar). Should yield each document unchanged.
        """
        # Create new index (on-disk or in-memory)
        if self._path is not None:
            wipe_index(self._path)
            new_index = tantivy.Index(build_schema(), path=str(self._path))
            _write_sentinels(self._path)
        else:
            new_index = tantivy.Index(build_schema())
        # Tokenizers must be re-registered on every fresh Index instance.
        register_tokenizers(new_index, settings.SEARCH_LANGUAGE)

        # Point instance at the new index so _build_tantivy_doc uses it
        old_index, old_schema = self._index, self._schema
        self._index = new_index
        self._schema = new_index.schema

        try:
            writer = new_index.writer()
            for document in iter_wrapper(documents):
                doc = self._build_tantivy_doc(
                    document,
                    document.get_effective_content(),
                )
                writer.add_document(doc)
            writer.commit()
            new_index.reload()
        except BaseException:  # pragma: no cover
            # Restore old index on failure so the backend remains usable.
            # BaseException so KeyboardInterrupt during a long rebuild also
            # rolls back to the previous (already wiped on disk) state.
            self._index = old_index
            self._schema = old_schema
            raise


# Module-level singleton with proper thread safety
_backend: TantivyBackend | None = None
_backend_path: Path | None = None  # tracks which INDEX_DIR the singleton uses
_backend_lock = threading.RLock()


def get_backend() -> TantivyBackend:
    """
    Get the global backend instance with thread safety.

    Returns a singleton TantivyBackend instance, automatically reinitializing
    when settings.INDEX_DIR changes. This ensures proper test isolation when
    using pytest-xdist or @override_settings that change the index directory.

    Returns:
        Thread-safe singleton TantivyBackend instance
    """
    global _backend, _backend_path

    current_path: Path = settings.INDEX_DIR

    # Fast path: backend is initialized and path hasn't changed (no lock needed)
    if _backend is not None and _backend_path == current_path:
        return _backend

    # Slow path: first call, or INDEX_DIR changed between calls
    with _backend_lock:
        # Double-check after acquiring lock — another thread may have beaten us
        if _backend is not None and _backend_path == current_path:
            return _backend  # pragma: no cover

        if _backend is not None:
            _backend.close()

        _backend = TantivyBackend(path=current_path)
        _backend.open()
        _backend_path = current_path

        return _backend


def reset_backend() -> None:
    """
    Reset the global backend instance with thread safety.

    Forces creation of a new backend instance on the next get_backend() call.
    Used for test isolation and when switching between different index directories.
    """
    global _backend, _backend_path

    with _backend_lock:
        if _backend is not None:
            _backend.close()
        _backend = None
        _backend_path = None
_REGEX_TIMEOUT: Final[float] = 1.0

_DATE_ONLY_FIELDS = frozenset({"created"})

_DATE_KEYWORDS = frozenset(
    {
        "today",
        "yesterday",
        "this_week",
        "last_week",
        "this_month",
        "last_month",
        "this_year",
        "last_year",
    },
)

_FIELD_DATE_RE = regex.compile(
    r"(\w+):(" + "|".join(_DATE_KEYWORDS) + r")\b",
)
_COMPACT_DATE_RE = regex.compile(r"\b(\d{14})\b")
_RELATIVE_RANGE_RE = regex.compile(
    r"\[now([+-]\d+[dhm])?\s+TO\s+now([+-]\d+[dhm])?\]",
    regex.IGNORECASE,
)
# Whoosh-style relative date range: e.g. [-1 week to now], [-7 days to now]
# The groups MUST be named (?P<n>…) / (?P<unit>…) — the substitution callback
# reads m.group("n") and m.group("unit").
_WHOOSH_REL_RANGE_RE = regex.compile(
    r"\[-(?P<n>\d+)\s+(?P<unit>second|minute|hour|day|week|month|year)s?\s+to\s+now\]",
    regex.IGNORECASE,
)
# Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be
# applied correctly; named groups are read via m.group("field")/m.group("date8").
_DATE8_RE = regex.compile(r"(?P<field>\w+):(?P<date8>\d{8})\b")


def _fmt(dt: datetime) -> str:
    """Format a datetime as an ISO 8601 UTC string for use in Tantivy range queries."""
    return dt.astimezone(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")


def _iso_range(lo: datetime, hi: datetime) -> str:
    """Format a [lo TO hi] range string in ISO 8601 for Tantivy query syntax."""
    return f"[{_fmt(lo)} TO {_fmt(hi)}]"


def _date_only_range(keyword: str, tz: tzinfo) -> str:
    """
    For `created` (DateField): use the local calendar date, converted to
    midnight UTC boundaries. No offset arithmetic — date only.

    Raises:
        ValueError: If `keyword` is not one of the supported date keywords.
    """

    today = datetime.now(tz).date()

    if keyword == "today":
        lo = datetime(today.year, today.month, today.day, tzinfo=UTC)
        return _iso_range(lo, lo + timedelta(days=1))
    if keyword == "yesterday":
        y = today - timedelta(days=1)
        lo = datetime(y.year, y.month, y.day, tzinfo=UTC)
        hi = datetime(today.year, today.month, today.day, tzinfo=UTC)
        return _iso_range(lo, hi)
    if keyword == "this_week":
        # Weeks start on Monday (weekday() == 0).
        mon = today - timedelta(days=today.weekday())
        lo = datetime(mon.year, mon.month, mon.day, tzinfo=UTC)
        return _iso_range(lo, lo + timedelta(weeks=1))
    if keyword == "last_week":
        this_mon = today - timedelta(days=today.weekday())
        last_mon = this_mon - timedelta(weeks=1)
        lo = datetime(last_mon.year, last_mon.month, last_mon.day, tzinfo=UTC)
        hi = datetime(this_mon.year, this_mon.month, this_mon.day, tzinfo=UTC)
        return _iso_range(lo, hi)
    if keyword == "this_month":
        lo = datetime(today.year, today.month, 1, tzinfo=UTC)
        if today.month == 12:
            hi = datetime(today.year + 1, 1, 1, tzinfo=UTC)
        else:
            hi = datetime(today.year, today.month + 1, 1, tzinfo=UTC)
        return _iso_range(lo, hi)
    if keyword == "last_month":
        if today.month == 1:
            lo = datetime(today.year - 1, 12, 1, tzinfo=UTC)
        else:
            lo = datetime(today.year, today.month - 1, 1, tzinfo=UTC)
        hi = datetime(today.year, today.month, 1, tzinfo=UTC)
        return _iso_range(lo, hi)
    if keyword == "this_year":
        lo = datetime(today.year, 1, 1, tzinfo=UTC)
        return _iso_range(lo, datetime(today.year + 1, 1, 1, tzinfo=UTC))
    if keyword == "last_year":
        lo = datetime(today.year - 1, 1, 1, tzinfo=UTC)
        return _iso_range(lo, datetime(today.year, 1, 1, tzinfo=UTC))
    raise ValueError(f"Unknown keyword: {keyword}")


def _datetime_range(keyword: str, tz: tzinfo) -> str:
    """
    For `added` / `modified` (DateTimeField, stored as UTC): convert local day
    boundaries to UTC — full offset arithmetic required.

    Raises:
        ValueError: If `keyword` is not one of the supported date keywords.
    """

    now_local = datetime.now(tz)
    today = now_local.date()

    def _midnight(d: date) -> datetime:
        # Local midnight of calendar day `d`, expressed in UTC.
        return datetime(d.year, d.month, d.day, tzinfo=tz).astimezone(UTC)

    if keyword == "today":
        return _iso_range(_midnight(today), _midnight(today + timedelta(days=1)))
    if keyword == "yesterday":
        y = today - timedelta(days=1)
        return _iso_range(_midnight(y), _midnight(today))
    if keyword == "this_week":
        mon = today - timedelta(days=today.weekday())
        return _iso_range(_midnight(mon), _midnight(mon + timedelta(weeks=1)))
    if keyword == "last_week":
        this_mon = today - timedelta(days=today.weekday())
        last_mon = this_mon - timedelta(weeks=1)
        return _iso_range(_midnight(last_mon), _midnight(this_mon))
    if keyword == "this_month":
        first = today.replace(day=1)
        if today.month == 12:
            next_first = date(today.year + 1, 1, 1)
        else:
            next_first = date(today.year, today.month + 1, 1)
        return _iso_range(_midnight(first), _midnight(next_first))
    if keyword == "last_month":
        this_first = today.replace(day=1)
        if today.month == 1:
            last_first = date(today.year - 1, 12, 1)
        else:
            last_first = date(today.year, today.month - 1, 1)
        return _iso_range(_midnight(last_first), _midnight(this_first))
    if keyword == "this_year":
        return _iso_range(
            _midnight(date(today.year, 1, 1)),
            _midnight(date(today.year + 1, 1, 1)),
        )
    if keyword == "last_year":
        return _iso_range(
            _midnight(date(today.year - 1, 1, 1)),
            _midnight(date(today.year, 1, 1)),
        )
    raise ValueError(f"Unknown keyword: {keyword}")


def _rewrite_compact_date(query: str) -> str:
    """Rewrite Whoosh compact date tokens (14-digit YYYYMMDDHHmmss) to ISO 8601."""

    def _sub(m: regex.Match[str]) -> str:
        raw = m.group(1)
        try:
            dt = datetime(
                int(raw[0:4]),
                int(raw[4:6]),
                int(raw[6:8]),
                int(raw[8:10]),
                int(raw[10:12]),
                int(raw[12:14]),
                tzinfo=UTC,
            )
            return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
        except ValueError:
            # Not a valid timestamp (e.g. month 13) — leave the token alone.
            return str(m.group(0))

    try:
        return _COMPACT_DATE_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT)
    except TimeoutError:  # pragma: no cover
        raise ValueError(
            "Query too complex to process (compact date rewrite timed out)",
        )


def _rewrite_relative_range(query: str) -> str:
    """Rewrite Whoosh relative ranges ([now-7d TO now]) to concrete ISO 8601 UTC boundaries."""

    def _sub(m: regex.Match[str]) -> str:
        now = datetime.now(UTC)

        def _offset(s: str | None) -> timedelta:
            if not s:
                return timedelta(0)
            sign = 1 if s[0] == "+" else -1
            n, unit = int(s[1:-1]), s[-1]
            return (
                sign
                * {
                    "d": timedelta(days=n),
                    "h": timedelta(hours=n),
                    "m": timedelta(minutes=n),
                }[unit]
            )

        lo, hi = now + _offset(m.group(1)), now + _offset(m.group(2))
        # Swap if the user wrote the bounds backwards so the range stays valid.
        if lo > hi:
            lo, hi = hi, lo
        return f"[{_fmt(lo)} TO {_fmt(hi)}]"

    try:
        return _RELATIVE_RANGE_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT)
    except TimeoutError:  # pragma: no cover
        raise ValueError(
            "Query too complex to process (relative range rewrite timed out)",
        )


def _rewrite_whoosh_relative_range(query: str) -> str:
    """Rewrite Whoosh-style relative date ranges ([-N unit to now]) to ISO 8601.

    Supports: second, minute, hour, day, week, month, year (singular and plural).
    Example: ``added:[-1 week to now]`` → ``added:[2025-01-01T… TO 2025-01-08T…]``
    """
    now = datetime.now(UTC)

    def _sub(m: regex.Match[str]) -> str:
        n = int(m.group("n"))
        unit = m.group("unit").lower()
        # month/year need relativedelta for correct calendar arithmetic.
        delta_map: dict[str, timedelta | relativedelta] = {
            "second": timedelta(seconds=n),
            "minute": timedelta(minutes=n),
            "hour": timedelta(hours=n),
            "day": timedelta(days=n),
            "week": timedelta(weeks=n),
            "month": relativedelta(months=n),
            "year": relativedelta(years=n),
        }
        lo = now - delta_map[unit]
        return f"[{_fmt(lo)} TO {_fmt(now)}]"

    try:
        return _WHOOSH_REL_RANGE_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT)
    except TimeoutError:  # pragma: no cover
        raise ValueError(
            "Query too complex to process (Whoosh relative range rewrite timed out)",
        )


def _rewrite_8digit_date(query: str, tz: tzinfo) -> str:
    """Rewrite field:YYYYMMDD date tokens to an ISO 8601 day range.

    Runs after ``_rewrite_compact_date`` so 14-digit timestamps are already
    converted and won't spuriously match here.

    For DateField fields (e.g. ``created``) uses UTC midnight boundaries.
    For DateTimeField fields (e.g. ``added``, ``modified``) uses local TZ
    midnight boundaries converted to UTC — matching the ``_datetime_range``
    behaviour for keyword dates.
    """

    def _sub(m: regex.Match[str]) -> str:
        field = m.group("field")
        raw = m.group("date8")
        try:
            year, month, day = int(raw[0:4]), int(raw[4:6]), int(raw[6:8])
            d = date(year, month, day)
            if field in _DATE_ONLY_FIELDS:
                lo = datetime(d.year, d.month, d.day, tzinfo=UTC)
                hi = lo + timedelta(days=1)
            else:
                # DateTimeField: use local-timezone midnight → UTC
                lo = datetime(d.year, d.month, d.day, tzinfo=tz).astimezone(UTC)
                hi = datetime(
                    (d + timedelta(days=1)).year,
                    (d + timedelta(days=1)).month,
                    (d + timedelta(days=1)).day,
                    tzinfo=tz,
                ).astimezone(UTC)
            return f"{field}:[{_fmt(lo)} TO {_fmt(hi)}]"
        except ValueError:
            # Not a real calendar date — leave the original token untouched.
            return m.group(0)

    try:
        return _DATE8_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT)
    except TimeoutError:  # pragma: no cover
        raise ValueError(
            "Query too complex to process (8-digit date rewrite timed out)",
        )
+ """ + query = _rewrite_compact_date(query) + query = _rewrite_whoosh_relative_range(query) + query = _rewrite_8digit_date(query, tz) + query = _rewrite_relative_range(query) + + def _replace(m: regex.Match[str]) -> str: + field, keyword = m.group(1), m.group(2) + if field in _DATE_ONLY_FIELDS: + return f"{field}:{_date_only_range(keyword, tz)}" + return f"{field}:{_datetime_range(keyword, tz)}" + + try: + return _FIELD_DATE_RE.sub(_replace, query, timeout=_REGEX_TIMEOUT) + except TimeoutError: # pragma: no cover + raise ValueError( + "Query too complex to process (date keyword rewrite timed out)", + ) + + +def normalize_query(query: str) -> str: + """ + Normalize query syntax for better search behavior. + + Expands comma-separated field values to explicit AND clauses and + collapses excessive whitespace for cleaner parsing: + - tag:foo,bar → tag:foo AND tag:bar + - multiple spaces → single spaces + + Args: + query: Query string after date rewriting + + Returns: + Normalized query string ready for Tantivy parsing + """ + + def _expand(m: regex.Match[str]) -> str: + field = m.group(1) + values = [v.strip() for v in m.group(2).split(",") if v.strip()] + return " AND ".join(f"{field}:{v}" for v in values) + + try: + query = regex.sub( + r"(\w+):([^\s\[\]]+(?:,[^\s\[\]]+)+)", + _expand, + query, + timeout=_REGEX_TIMEOUT, + ) + return regex.sub(r" {2,}", " ", query, timeout=_REGEX_TIMEOUT).strip() + except TimeoutError: # pragma: no cover + raise ValueError("Query too complex to process (normalization timed out)") + + +_MAX_U64 = 2**64 - 1 # u64 max — used as inclusive upper bound for "any owner" range + + +def build_permission_filter( + schema: tantivy.Schema, + user: AbstractBaseUser, +) -> tantivy.Query: + """ + Build a query filter for user document permissions. 
+ + Creates a query that matches only documents visible to the specified user + according to paperless-ngx permission rules: + - Public documents (no owner) are visible to all users + - Private documents are visible to their owner + - Documents explicitly shared with the user are visible + + Args: + schema: Tantivy schema for field validation + user: User to check permissions for + + Returns: + Tantivy query that filters results to visible documents + + Implementation Notes: + - Uses range_query instead of term_query to work around unsigned integer + type detection bug in tantivy-py 0.25 + - Uses boolean_query for "no owner" check since exists_query is not + available in tantivy-py 0.25.1 (available in master) + - Uses disjunction_max_query to combine permission clauses with OR logic + """ + owner_any = tantivy.Query.range_query( + schema, + "owner_id", + tantivy.FieldType.Unsigned, + 1, + _MAX_U64, + ) + no_owner = tantivy.Query.boolean_query( + [ + (tantivy.Occur.Must, tantivy.Query.all_query()), + (tantivy.Occur.MustNot, owner_any), + ], + ) + owned = tantivy.Query.range_query( + schema, + "owner_id", + tantivy.FieldType.Unsigned, + user.pk, + user.pk, + ) + shared = tantivy.Query.range_query( + schema, + "viewer_id", + tantivy.FieldType.Unsigned, + user.pk, + user.pk, + ) + return tantivy.Query.disjunction_max_query([no_owner, owned, shared]) + + +DEFAULT_SEARCH_FIELDS = [ + "title", + "content", + "correspondent", + "document_type", + "tag", +] +_FIELD_BOOSTS = {"title": 2.0} + + +def parse_user_query( + index: tantivy.Index, + raw_query: str, + tz: tzinfo, +) -> tantivy.Query: + """ + Parse user query through the complete preprocessing pipeline. + + Transforms the raw user query through multiple stages: + 1. Date keyword rewriting (today → ISO 8601 ranges) + 2. Query normalization (comma expansion, whitespace cleanup) + 3. Tantivy parsing with field boosts + 4. 
def parse_user_query(
    index: tantivy.Index,
    raw_query: str,
    tz: tzinfo,
) -> tantivy.Query:
    """
    Parse user query through the complete preprocessing pipeline.

    Transforms the raw user query through multiple stages:
    1. Date keyword rewriting (today → ISO 8601 ranges)
    2. Query normalization (comma expansion, whitespace cleanup)
    3. Tantivy parsing with field boosts
    4. Optional fuzzy query blending (if ADVANCED_FUZZY_SEARCH_THRESHOLD set)

    Args:
        index: Tantivy index with registered tokenizers
        raw_query: Original user query string
        tz: Timezone for date boundary calculations

    Returns:
        Parsed Tantivy query ready for execution

    Note:
        When ADVANCED_FUZZY_SEARCH_THRESHOLD is configured, adds a low-priority
        fuzzy query as a Should clause (0.1 boost) to catch approximate matches
        while keeping exact matches ranked higher. The threshold value is applied
        as a post-search score filter, not during query construction.
    """

    query_str = rewrite_natural_date_keywords(raw_query, tz)
    query_str = normalize_query(query_str)

    exact = index.parse_query(
        query_str,
        DEFAULT_SEARCH_FIELDS,
        field_boosts=_FIELD_BOOSTS,
    )

    threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
    if threshold is not None:
        fuzzy = index.parse_query(
            query_str,
            DEFAULT_SEARCH_FIELDS,
            field_boosts=_FIELD_BOOSTS,
            # (prefix=True, distance=1, transposition_cost_one=True) — edit-distance fuzziness
            fuzzy_fields={f: (True, 1, True) for f in DEFAULT_SEARCH_FIELDS},
        )
        return tantivy.Query.boolean_query(
            [
                (tantivy.Occur.Should, exact),
                # 0.1 boost keeps fuzzy hits ranked below exact matches (intentional)
                (tantivy.Occur.Should, tantivy.Query.boost_query(fuzzy, 0.1)),
            ],
        )

    return exact


# --- src/documents/search/_schema.py ---

logger = logging.getLogger("paperless.search")

# Bump when the schema below changes incompatibly; triggers a full rebuild.
SCHEMA_VERSION = 1


def build_schema() -> tantivy.Schema:
    """
    Build the Tantivy schema for the paperless document index.

    Creates a comprehensive schema supporting full-text search, filtering,
    sorting, and autocomplete functionality. Includes fields for document
    content, metadata, permissions, custom fields, and notes.

    Returns:
        Configured Tantivy schema ready for index creation
    """
    sb = tantivy.SchemaBuilder()

    sb.add_unsigned_field("id", stored=True, indexed=True, fast=True)
    sb.add_text_field("checksum", stored=True, tokenizer_name="raw")

    # Primary full-text fields, analyzed with the paperless_text tokenizer.
    for field in (
        "title",
        "correspondent",
        "document_type",
        "storage_path",
        "original_filename",
        "content",
    ):
        sb.add_text_field(field, stored=True, tokenizer_name="paperless_text")

    # Shadow sort fields - fast, not stored/indexed
    for field in ("title_sort", "correspondent_sort", "type_sort"):
        sb.add_text_field(
            field,
            stored=False,
            tokenizer_name="simple_analyzer",
            fast=True,
        )

    # CJK support - not stored, indexed only
    sb.add_text_field("bigram_content", stored=False, tokenizer_name="bigram_analyzer")

    # Autocomplete prefix scan - stored, not indexed
    sb.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw")

    sb.add_text_field("tag", stored=True, tokenizer_name="paperless_text")

    # JSON fields — structured queries: notes.user:alice, custom_fields.name:invoice
    sb.add_json_field("notes", stored=True, tokenizer_name="paperless_text")
    sb.add_json_field("custom_fields", stored=True, tokenizer_name="paperless_text")

    # Numeric foreign keys for exact filtering (including permission fields).
    for field in (
        "correspondent_id",
        "document_type_id",
        "storage_path_id",
        "tag_id",
        "owner_id",
        "viewer_id",
    ):
        sb.add_unsigned_field(field, stored=False, indexed=True, fast=True)

    for field in ("created", "modified", "added"):
        sb.add_date_field(field, stored=True, indexed=True, fast=True)

    for field in ("asn", "page_count", "num_notes"):
        sb.add_unsigned_field(field, stored=True, indexed=True, fast=True)

    return sb.build()


def needs_rebuild(index_dir: Path) -> bool:
    """
    Check if the search index needs rebuilding.

    Compares the current schema version and search language configuration
    against sentinel files to determine if the index is compatible with
    the current paperless-ngx version and settings.

    Args:
        index_dir: Path to the search index directory

    Returns:
        True if the index needs rebuilding, False if it's up to date
    """
    version_file = index_dir / ".schema_version"
    if not version_file.exists():
        return True
    try:
        if int(version_file.read_text().strip()) != SCHEMA_VERSION:
            logger.info("Search index schema version mismatch - rebuilding.")
            return True
    except ValueError:
        # Sentinel is corrupt/unreadable as an int — treat as stale.
        return True

    language_file = index_dir / ".schema_language"
    if not language_file.exists():
        logger.info("Search index language sentinel missing - rebuilding.")
        return True
    if language_file.read_text().strip() != (settings.SEARCH_LANGUAGE or ""):
        logger.info("Search index language changed - rebuilding.")
        return True

    return False


def wipe_index(index_dir: Path) -> None:
    """
    Delete all contents of the index directory to prepare for rebuild.

    Recursively removes all files and subdirectories within the index
    directory while preserving the directory itself.

    Args:
        index_dir: Path to the search index directory to clear
    """
    for child in index_dir.iterdir():
        if child.is_dir():
            shutil.rmtree(child)
        else:
            child.unlink()


def _write_sentinels(index_dir: Path) -> None:
    """Write schema version and language sentinel files so the next index open can skip rebuilding."""
    (index_dir / ".schema_version").write_text(str(SCHEMA_VERSION))
    (index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE or "")
def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index:
    """
    Open the Tantivy index, creating or rebuilding as needed.

    When the existing index is incompatible (schema version or search
    language changed) the directory is wiped and a fresh index is created
    with the current schema; a missing directory yields an in-memory index.

    Args:
        index_dir: Path to index directory (defaults to settings.INDEX_DIR)

    Returns:
        Opened Tantivy index (caller must register custom tokenizers)
    """
    target = settings.INDEX_DIR if index_dir is None else index_dir

    # No directory on disk yet: serve a transient in-memory index.
    if not target.exists():
        return tantivy.Index(build_schema())

    if needs_rebuild(target):
        # Incompatible index: start over from an empty directory.
        wipe_index(target)
        fresh = tantivy.Index(build_schema(), path=str(target))
        _write_sentinels(target)
        return fresh

    return tantivy.Index.open(str(target))


# --- src/documents/search/_tokenizer.py ---

logger = logging.getLogger("paperless.search")

# Mapping of ISO 639-1 codes (and common aliases) -> Tantivy Snowball name
_LANGUAGE_MAP: dict[str, str] = {
    "ar": "Arabic",
    "arabic": "Arabic",
    "da": "Danish",
    "danish": "Danish",
    "nl": "Dutch",
    "dutch": "Dutch",
    "en": "English",
    "english": "English",
    "fi": "Finnish",
    "finnish": "Finnish",
    "fr": "French",
    "french": "French",
    "de": "German",
    "german": "German",
    "el": "Greek",
    "greek": "Greek",
    "hu": "Hungarian",
    "hungarian": "Hungarian",
    "it": "Italian",
    "italian": "Italian",
    "no": "Norwegian",
    "norwegian": "Norwegian",
    "pt": "Portuguese",
    "portuguese": "Portuguese",
    "ro": "Romanian",
    "romanian": "Romanian",
    "ru": "Russian",
    "russian": "Russian",
    "es": "Spanish",
    "spanish": "Spanish",
    "sv": "Swedish",
    "swedish": "Swedish",
    "ta": "Tamil",
    "tamil": "Tamil",
    "tr": "Turkish",
    "turkish": "Turkish",
}

SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)


def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
    """
    Register all custom tokenizers required by the paperless schema.

    Must be called on every Index instance since Tantivy requires tokenizer
    re-registration after each index open/creation. Registers tokenizers for
    full-text search, sorting, CJK language support, and fast-field indexing.

    Args:
        index: Tantivy index instance to register tokenizers on
        language: ISO 639-1 language code for stemming (None to disable)

    Note:
        simple_analyzer is registered as both a text and fast-field tokenizer
        since sort shadow fields (title_sort, correspondent_sort, type_sort)
        use fast=True and Tantivy requires fast-field tokenizers to exist
        even for documents that omit those fields.
    """
    text_analyzers = {
        "paperless_text": _paperless_text(language),
        "simple_analyzer": _simple_analyzer(),
        "bigram_analyzer": _bigram_analyzer(),
    }
    for name, analyzer in text_analyzers.items():
        index.register_tokenizer(name, analyzer)
    # Fast-field tokenizer required for fast=True text fields in the schema.
    index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())


def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
    """Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
    chain = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
    chain = chain.filter(tantivy.Filter.remove_long(65))
    chain = chain.filter(tantivy.Filter.lowercase())
    chain = chain.filter(tantivy.Filter.ascii_fold())

    if language:
        snowball_name = _LANGUAGE_MAP.get(language.lower())
        if snowball_name is None:
            logger.warning(
                "Unsupported search language '%s' - stemming disabled. Supported: %s",
                language,
                ", ".join(sorted(SUPPORTED_LANGUAGES)),
            )
        else:
            chain = chain.filter(tantivy.Filter.stemmer(snowball_name))

    return chain.build()


def _simple_analyzer() -> tantivy.TextAnalyzer:
    """Tokenizer for shadow sort fields (title_sort, correspondent_sort, type_sort): simple -> lowercase -> ascii_fold."""
    builder = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
    builder = builder.filter(tantivy.Filter.lowercase())
    builder = builder.filter(tantivy.Filter.ascii_fold())
    return builder.build()


def _bigram_analyzer() -> tantivy.TextAnalyzer:
    """Enables substring search in CJK text: ngram(2,2) -> lowercase. CJK / no-whitespace language support."""
    two_gram = tantivy.Tokenizer.ngram(min_gram=2, max_gram=2, prefix_only=False)
    return (
        tantivy.TextAnalyzerBuilder(two_gram)
        .filter(tantivy.Filter.lowercase())
        .build()
    )
def add_to_index(sender, document, **kwargs) -> None:
    """Signal handler: (re)index *document* in the Tantivy search backend."""
    from documents.search import get_backend

    backend = get_backend()
    backend.add_or_update(
        document,
        effective_content=document.get_effective_content(),
    )
Iterable[_T]) -> Iterable[_T]: - return iterable - - @shared_task def index_optimize() -> None: - ix = index.open_index() - writer = AsyncWriter(ix) - writer.commit(optimize=True) - - -def index_reindex(*, iter_wrapper: IterWrapper[Document] = _identity) -> None: - documents = Document.objects.all() - - ix = index.open_index(recreate=True) - - with AsyncWriter(ix) as writer: - for document in iter_wrapper(documents): - index.update_document(writer, document) + logger.info( + "index_optimize is a no-op — Tantivy manages segment merging automatically.", + ) @shared_task @@ -270,9 +250,9 @@ def sanity_check(*, scheduled=True, raise_on_error=True): @shared_task def bulk_update_documents(document_ids) -> None: - documents = Document.objects.filter(id__in=document_ids) + from documents.search import get_backend - ix = index.open_index() + documents = Document.objects.filter(id__in=document_ids) for doc in documents: clear_document_caches(doc.pk) @@ -283,9 +263,9 @@ def bulk_update_documents(document_ids) -> None: ) post_save.send(Document, instance=doc, created=False) - with AsyncWriter(ix) as writer: + with get_backend().batch_update() as batch: for doc in documents: - index.update_document(writer, doc) + batch.add_or_update(doc) ai_config = AIConfig() if ai_config.llm_index_enabled: @@ -389,8 +369,9 @@ def update_document_content_maybe_archive_file(document_id) -> None: logger.info( f"Updating index for document {document_id} ({document.archive_checksum})", ) - with index.open_index_writer() as writer: - index.update_document(writer, document) + from documents.search import get_backend + + get_backend().add_or_update(document) ai_config = AIConfig() if ai_config.llm_index_enabled: @@ -633,7 +614,7 @@ def update_document_parent_tags(tag: Tag, new_parent: Tag) -> None: @shared_task def llmindex_index( *, - iter_wrapper: IterWrapper[Document] = _identity, + iter_wrapper: IterWrapper[Document] = identity, rebuild=False, scheduled=True, auto=False, diff --git 
a/src/documents/tests/conftest.py b/src/documents/tests/conftest.py index 7e75b9194..5cde34768 100644 --- a/src/documents/tests/conftest.py +++ b/src/documents/tests/conftest.py @@ -1,5 +1,6 @@ import shutil import zoneinfo +from collections.abc import Generator from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING @@ -92,6 +93,26 @@ def sample_doc( ) +@pytest.fixture() +def _search_index( + tmp_path: Path, + settings: SettingsWrapper, +) -> Generator[None, None, None]: + """Create a temp index directory and point INDEX_DIR at it. + + Resets the backend singleton before and after so each test gets a clean + index rather than reusing a stale singleton from another test. + """ + from documents.search import reset_backend + + index_dir = tmp_path / "index" + index_dir.mkdir() + settings.INDEX_DIR = index_dir + reset_backend() + yield + reset_backend() + + @pytest.fixture() def settings_timezone(settings: SettingsWrapper) -> zoneinfo.ZoneInfo: return zoneinfo.ZoneInfo(settings.TIME_ZONE) diff --git a/src/documents/tests/search/__init__.py b/src/documents/tests/search/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/documents/tests/search/conftest.py b/src/documents/tests/search/conftest.py new file mode 100644 index 000000000..ccc26d695 --- /dev/null +++ b/src/documents/tests/search/conftest.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +from documents.search._backend import TantivyBackend +from documents.search._backend import reset_backend + +if TYPE_CHECKING: + from collections.abc import Generator + from pathlib import Path + + from pytest_django.fixtures import SettingsWrapper + + +@pytest.fixture +def index_dir(tmp_path: Path, settings: SettingsWrapper) -> Path: + path = tmp_path / "index" + path.mkdir() + settings.INDEX_DIR = path + return path + + +@pytest.fixture +def backend() -> Generator[TantivyBackend, None, None]: + b = 
TantivyBackend() # path=None → in-memory index + b.open() + try: + yield b + finally: + b.close() + reset_backend() diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py new file mode 100644 index 000000000..5c92da447 --- /dev/null +++ b/src/documents/tests/search/test_backend.py @@ -0,0 +1,502 @@ +import pytest +from django.contrib.auth.models import User + +from documents.models import CustomField +from documents.models import CustomFieldInstance +from documents.models import Document +from documents.models import Note +from documents.search._backend import TantivyBackend +from documents.search._backend import get_backend +from documents.search._backend import reset_backend + +pytestmark = [pytest.mark.search, pytest.mark.django_db] + + +class TestWriteBatch: + """Test WriteBatch context manager functionality.""" + + def test_rolls_back_on_exception(self, backend: TantivyBackend): + """Batch operations must rollback on exception to preserve index integrity.""" + doc = Document.objects.create( + title="Rollback Target", + content="should survive", + checksum="RB1", + pk=1, + ) + backend.add_or_update(doc) + + try: + with backend.batch_update() as batch: + batch.remove(doc.pk) + raise RuntimeError("simulated failure") + except RuntimeError: + pass + + r = backend.search( + "should survive", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + ) + assert r.total == 1 + + +class TestSearch: + """Test search functionality.""" + + def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend): + """Search scores must be normalized so top hit has score 1.0 for UI consistency.""" + for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]): + doc = Document.objects.create( + title=title, + content=title, + checksum=f"SN{i}", + pk=10 + i, + ) + backend.add_or_update(doc) + r = backend.search( + "bank", + user=None, + page=1, + page_size=10, + sort_field=None, + 
sort_reverse=False, + ) + assert r.hits[0]["score"] == pytest.approx(1.0) + assert all(0.0 <= h["score"] <= 1.0 for h in r.hits) + + def test_sort_field_ascending(self, backend: TantivyBackend): + """Searching with sort_reverse=False must return results in ascending ASN order.""" + for asn in [30, 10, 20]: + doc = Document.objects.create( + title="sortable", + content="sortable content", + checksum=f"SFA{asn}", + archive_serial_number=asn, + ) + backend.add_or_update(doc) + + r = backend.search( + "sortable", + user=None, + page=1, + page_size=10, + sort_field="archive_serial_number", + sort_reverse=False, + ) + assert r.total == 3 + asns = [Document.objects.get(pk=h["id"]).archive_serial_number for h in r.hits] + assert asns == [10, 20, 30] + + def test_sort_field_descending(self, backend: TantivyBackend): + """Searching with sort_reverse=True must return results in descending ASN order.""" + for asn in [30, 10, 20]: + doc = Document.objects.create( + title="sortable", + content="sortable content", + checksum=f"SFD{asn}", + archive_serial_number=asn, + ) + backend.add_or_update(doc) + + r = backend.search( + "sortable", + user=None, + page=1, + page_size=10, + sort_field="archive_serial_number", + sort_reverse=True, + ) + assert r.total == 3 + asns = [Document.objects.get(pk=h["id"]).archive_serial_number for h in r.hits] + assert asns == [30, 20, 10] + + def test_fuzzy_threshold_filters_low_score_hits( + self, + backend: TantivyBackend, + settings, + ): + """When ADVANCED_FUZZY_SEARCH_THRESHOLD exceeds all normalized scores, hits must be filtered out.""" + doc = Document.objects.create( + title="Invoice document", + content="financial report", + checksum="FT1", + pk=120, + ) + backend.add_or_update(doc) + + # Threshold above 1.0 filters every hit (normalized scores top out at 1.0) + settings.ADVANCED_FUZZY_SEARCH_THRESHOLD = 1.1 + r = backend.search( + "invoice", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + ) + assert r.hits 
== [] + + def test_owner_filter(self, backend: TantivyBackend): + """Document owners can search their private documents; other users cannot access them.""" + owner = User.objects.create_user("owner") + other = User.objects.create_user("other") + doc = Document.objects.create( + title="Private", + content="secret", + checksum="PF1", + pk=20, + owner=owner, + ) + backend.add_or_update(doc) + + assert ( + backend.search( + "secret", + user=owner, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + ).total + == 1 + ) + assert ( + backend.search( + "secret", + user=other, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + ).total + == 0 + ) + + +class TestRebuild: + """Test index rebuilding functionality.""" + + def test_with_iter_wrapper_called(self, backend: TantivyBackend): + """Index rebuild must pass documents through iter_wrapper for progress tracking.""" + seen = [] + + def wrapper(docs): + for doc in docs: + seen.append(doc.pk) + yield doc + + Document.objects.create(title="Tracked", content="x", checksum="TW1", pk=30) + backend.rebuild(Document.objects.all(), iter_wrapper=wrapper) + assert 30 in seen + + +class TestAutocomplete: + """Test autocomplete functionality.""" + + def test_basic_functionality(self, backend: TantivyBackend): + """Autocomplete must return words matching the given prefix.""" + doc = Document.objects.create( + title="Invoice from Microsoft Corporation", + content="payment details", + checksum="AC1", + pk=40, + ) + backend.add_or_update(doc) + + results = backend.autocomplete("micro", limit=10) + assert "microsoft" in results + + def test_results_ordered_by_document_frequency(self, backend: TantivyBackend): + """Autocomplete results must be ordered by document frequency to prioritize common terms.""" + # "payment" appears in 3 docs; "payslip" in 1 — "pay" prefix should + # return "payment" before "payslip". 
+ for i, (title, checksum) in enumerate( + [ + ("payment invoice", "AF1"), + ("payment receipt", "AF2"), + ("payment confirmation", "AF3"), + ("payslip march", "AF4"), + ], + start=41, + ): + doc = Document.objects.create( + title=title, + content="details", + checksum=checksum, + pk=i, + ) + backend.add_or_update(doc) + + results = backend.autocomplete("pay", limit=10) + assert results.index("payment") < results.index("payslip") + + +class TestMoreLikeThis: + """Test more like this functionality.""" + + def test_excludes_original(self, backend: TantivyBackend): + """More like this queries must exclude the reference document from results.""" + doc1 = Document.objects.create( + title="Important document", + content="financial information", + checksum="MLT1", + pk=50, + ) + doc2 = Document.objects.create( + title="Another document", + content="financial report", + checksum="MLT2", + pk=51, + ) + backend.add_or_update(doc1) + backend.add_or_update(doc2) + + results = backend.more_like_this(doc_id=50, user=None, page=1, page_size=10) + returned_ids = [hit["id"] for hit in results.hits] + assert 50 not in returned_ids # Original document excluded + + def test_with_user_applies_permission_filter(self, backend: TantivyBackend): + """more_like_this with a user must exclude documents that user cannot see.""" + viewer = User.objects.create_user("mlt_viewer") + other = User.objects.create_user("mlt_other") + public_doc = Document.objects.create( + title="Public financial document", + content="quarterly financial analysis report figures", + checksum="MLT3", + pk=52, + ) + private_doc = Document.objects.create( + title="Private financial document", + content="quarterly financial analysis report figures", + checksum="MLT4", + pk=53, + owner=other, + ) + backend.add_or_update(public_doc) + backend.add_or_update(private_doc) + + results = backend.more_like_this(doc_id=52, user=viewer, page=1, page_size=10) + returned_ids = [hit["id"] for hit in results.hits] + # private_doc is 
owned by other, so viewer cannot see it + assert 53 not in returned_ids + + def test_document_not_in_index_returns_empty(self, backend: TantivyBackend): + """more_like_this for a doc_id absent from the index must return empty results.""" + results = backend.more_like_this(doc_id=9999, user=None, page=1, page_size=10) + assert results.hits == [] + assert results.total == 0 + + +class TestSingleton: + """Test get_backend() and reset_backend() singleton lifecycle.""" + + @pytest.fixture(autouse=True) + def _clean(self): + reset_backend() + yield + reset_backend() + + def test_returns_same_instance_on_repeated_calls(self, index_dir): + """Singleton pattern: repeated calls to get_backend() must return the same instance.""" + assert get_backend() is get_backend() + + def test_reinitializes_when_index_dir_changes(self, tmp_path, settings): + """Backend singleton must reinitialize when INDEX_DIR setting changes for test isolation.""" + settings.INDEX_DIR = tmp_path / "a" + (tmp_path / "a").mkdir() + b1 = get_backend() + + settings.INDEX_DIR = tmp_path / "b" + (tmp_path / "b").mkdir() + b2 = get_backend() + + assert b1 is not b2 + assert b2._path == tmp_path / "b" + + def test_reset_forces_new_instance(self, index_dir): + """reset_backend() must force creation of a new backend instance on next get_backend() call.""" + b1 = get_backend() + reset_backend() + b2 = get_backend() + assert b1 is not b2 + + +class TestFieldHandling: + """Test handling of various document fields.""" + + def test_none_values_handled_correctly(self, backend: TantivyBackend): + """Document fields with None values must not cause indexing errors.""" + doc = Document.objects.create( + title="Test Doc", + content="test content", + checksum="NV1", + pk=60, + original_filename=None, + page_count=None, + ) + # Should not raise an exception + backend.add_or_update(doc) + + results = backend.search( + "test", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + ) + assert 
results.total == 1 + + def test_custom_fields_include_name_and_value(self, backend: TantivyBackend): + """Custom fields must be indexed with both field name and value for structured queries.""" + # Create a custom field + field = CustomField.objects.create( + name="Invoice Number", + data_type=CustomField.FieldDataType.STRING, + ) + doc = Document.objects.create( + title="Invoice", + content="test", + checksum="CF1", + pk=70, + ) + CustomFieldInstance.objects.create( + document=doc, + field=field, + value_text="INV-2024-001", + ) + + # Should not raise an exception during indexing + backend.add_or_update(doc) + + results = backend.search( + "invoice", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + ) + assert results.total == 1 + + def test_select_custom_field_indexes_label_not_id(self, backend: TantivyBackend): + """SELECT custom fields must index the human-readable label, not the opaque option ID.""" + field = CustomField.objects.create( + name="Category", + data_type=CustomField.FieldDataType.SELECT, + extra_data={ + "select_options": [ + {"id": "opt_abc", "label": "Invoice"}, + {"id": "opt_def", "label": "Receipt"}, + ], + }, + ) + doc = Document.objects.create( + title="Categorised doc", + content="test", + checksum="SEL1", + pk=71, + ) + CustomFieldInstance.objects.create( + document=doc, + field=field, + value_select="opt_abc", + ) + backend.add_or_update(doc) + + # Label should be findable + results = backend.search( + "custom_fields.value:invoice", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + ) + assert results.total == 1 + + # Opaque ID must not appear in the index + results = backend.search( + "custom_fields.value:opt_abc", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + ) + assert results.total == 0 + + def test_none_custom_field_value_not_indexed(self, backend: TantivyBackend): + """Custom field instances with no value set must not produce an index 
entry.""" + field = CustomField.objects.create( + name="Optional", + data_type=CustomField.FieldDataType.SELECT, + extra_data={"select_options": [{"id": "opt_1", "label": "Yes"}]}, + ) + doc = Document.objects.create( + title="Unset field doc", + content="test", + checksum="SEL2", + pk=72, + ) + CustomFieldInstance.objects.create( + document=doc, + field=field, + value_select=None, + ) + backend.add_or_update(doc) + + # The string "none" must not appear as an indexed value + results = backend.search( + "custom_fields.value:none", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + ) + assert results.total == 0 + + def test_notes_include_user_information(self, backend: TantivyBackend): + """Notes must be indexed with user information when available for structured queries.""" + user = User.objects.create_user("notewriter") + doc = Document.objects.create( + title="Doc with notes", + content="test", + checksum="NT1", + pk=80, + ) + Note.objects.create(document=doc, note="Important note", user=user) + + # Should not raise an exception during indexing + backend.add_or_update(doc) + + # Test basic document search first + results = backend.search( + "test", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + ) + assert results.total == 1, ( + f"Expected 1, got {results.total}. Document content should be searchable." + ) + + # Test notes search — must use structured JSON syntax now that note + # is no longer in DEFAULT_SEARCH_FIELDS + results = backend.search( + "notes.note:important", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + ) + assert results.total == 1, ( + f"Expected 1, got {results.total}. Note content should be searchable via notes.note: prefix." 
+ ) diff --git a/src/documents/tests/search/test_migration_fulltext_query_field_prefixes.py b/src/documents/tests/search/test_migration_fulltext_query_field_prefixes.py new file mode 100644 index 000000000..739ea996c --- /dev/null +++ b/src/documents/tests/search/test_migration_fulltext_query_field_prefixes.py @@ -0,0 +1,138 @@ +import pytest + +from documents.tests.utils import TestMigrations + +pytestmark = pytest.mark.search + + +class TestMigrateFulltextQueryFieldPrefixes(TestMigrations): + migrate_from = "0016_sha256_checksums" + migrate_to = "0017_migrate_fulltext_query_field_prefixes" + + def setUpBeforeMigration(self, apps) -> None: + User = apps.get_model("auth", "User") + SavedView = apps.get_model("documents", "SavedView") + SavedViewFilterRule = apps.get_model("documents", "SavedViewFilterRule") + + user = User.objects.create(username="testuser") + + def make_rule(value: str): + view = SavedView.objects.create( + owner=user, + name=f"view-{value}", + sort_field="created", + ) + return SavedViewFilterRule.objects.create( + saved_view=view, + rule_type=20, # fulltext query + value=value, + ) + + # Simple field prefixes + self.rule_note = make_rule("note:invoice") + self.rule_cf = make_rule("custom_field:amount") + + # Combined query + self.rule_combined = make_rule("note:invoice AND custom_field:total") + + # Parenthesized groups (Whoosh syntax) + self.rule_parens = make_rule("(note:invoice OR note:receipt)") + + # Prefix operators + self.rule_plus = make_rule("+note:foo") + self.rule_minus = make_rule("-note:bar") + + # Boosted + self.rule_boost = make_rule("note:test^2") + + # Should NOT be rewritten — no field prefix match + self.rule_no_match = make_rule("title:hello content:world") + + # Should NOT false-positive on word boundaries + self.rule_denote = make_rule("denote:foo") + + # Already using new syntax — should be idempotent + self.rule_already_migrated = make_rule("notes.note:foo") + self.rule_already_migrated_cf = 
make_rule("custom_fields.value:bar") + + # Null value — should not crash + view_null = SavedView.objects.create( + owner=user, + name="view-null", + sort_field="created", + ) + self.rule_null = SavedViewFilterRule.objects.create( + saved_view=view_null, + rule_type=20, + value=None, + ) + + # Non-fulltext rule type — should be untouched + view_other = SavedView.objects.create( + owner=user, + name="view-other-type", + sort_field="created", + ) + self.rule_other_type = SavedViewFilterRule.objects.create( + saved_view=view_other, + rule_type=0, # title contains + value="note:something", + ) + + def test_note_prefix_rewritten(self): + self.rule_note.refresh_from_db() + self.assertEqual(self.rule_note.value, "notes.note:invoice") + + def test_custom_field_prefix_rewritten(self): + self.rule_cf.refresh_from_db() + self.assertEqual(self.rule_cf.value, "custom_fields.value:amount") + + def test_combined_query_rewritten(self): + self.rule_combined.refresh_from_db() + self.assertEqual( + self.rule_combined.value, + "notes.note:invoice AND custom_fields.value:total", + ) + + def test_parenthesized_groups(self): + self.rule_parens.refresh_from_db() + self.assertEqual( + self.rule_parens.value, + "(notes.note:invoice OR notes.note:receipt)", + ) + + def test_plus_prefix(self): + self.rule_plus.refresh_from_db() + self.assertEqual(self.rule_plus.value, "+notes.note:foo") + + def test_minus_prefix(self): + self.rule_minus.refresh_from_db() + self.assertEqual(self.rule_minus.value, "-notes.note:bar") + + def test_boosted(self): + self.rule_boost.refresh_from_db() + self.assertEqual(self.rule_boost.value, "notes.note:test^2") + + def test_no_match_unchanged(self): + self.rule_no_match.refresh_from_db() + self.assertEqual(self.rule_no_match.value, "title:hello content:world") + + def test_word_boundary_no_false_positive(self): + self.rule_denote.refresh_from_db() + self.assertEqual(self.rule_denote.value, "denote:foo") + + def test_already_migrated_idempotent(self): + 
self.rule_already_migrated.refresh_from_db() + self.assertEqual(self.rule_already_migrated.value, "notes.note:foo") + + def test_already_migrated_cf_idempotent(self): + self.rule_already_migrated_cf.refresh_from_db() + self.assertEqual(self.rule_already_migrated_cf.value, "custom_fields.value:bar") + + def test_null_value_no_crash(self): + self.rule_null.refresh_from_db() + self.assertIsNone(self.rule_null.value) + + def test_non_fulltext_rule_untouched(self): + self.rule_other_type.refresh_from_db() + self.assertEqual(self.rule_other_type.value, "note:something") diff --git a/src/documents/tests/search/test_query.py b/src/documents/tests/search/test_query.py new file mode 100644 index 000000000..74a064dbb --- /dev/null +++ b/src/documents/tests/search/test_query.py @@ -0,0 +1,530 @@ +from __future__ import annotations + +import re +from datetime import UTC +from datetime import datetime +from datetime import tzinfo +from typing import TYPE_CHECKING +from zoneinfo import ZoneInfo + +import pytest +import tantivy +import time_machine + +from documents.search._query import _date_only_range +from documents.search._query import _datetime_range +from documents.search._query import _rewrite_compact_date +from documents.search._query import build_permission_filter +from documents.search._query import normalize_query +from documents.search._query import parse_user_query +from documents.search._query import rewrite_natural_date_keywords +from documents.search._schema import build_schema +from documents.search._tokenizer import register_tokenizers + +if TYPE_CHECKING: + from django.contrib.auth.base_user import AbstractBaseUser + +pytestmark = pytest.mark.search + +EASTERN = ZoneInfo("America/New_York") # UTC-5 / UTC-4 (DST) +AUCKLAND = ZoneInfo("Pacific/Auckland") # UTC+13 in southern-hemisphere summer + + +def _range(result: str, field: str) -> tuple[str, str]: + m = re.search(rf"{field}:\[(.+?) 
TO (.+?)\]", result) + assert m, f"No range for {field!r} in: {result!r}" + return m.group(1), m.group(2) + + +class TestCreatedDateField: + """ + created is a Django DateField: indexed as midnight UTC of the local calendar + date. No offset arithmetic needed - the local calendar date is what matters. + """ + + @pytest.mark.parametrize( + ("tz", "expected_lo", "expected_hi"), + [ + pytest.param(UTC, "2026-03-28T00:00:00Z", "2026-03-29T00:00:00Z", id="utc"), + pytest.param( + EASTERN, + "2026-03-28T00:00:00Z", + "2026-03-29T00:00:00Z", + id="eastern_same_calendar_date", + ), + ], + ) + @time_machine.travel(datetime(2026, 3, 28, 15, 30, tzinfo=UTC), tick=False) + def test_today(self, tz: tzinfo, expected_lo: str, expected_hi: str) -> None: + lo, hi = _range(rewrite_natural_date_keywords("created:today", tz), "created") + assert lo == expected_lo + assert hi == expected_hi + + @time_machine.travel(datetime(2026, 3, 28, 3, 0, tzinfo=UTC), tick=False) + def test_today_auckland_ahead_of_utc(self) -> None: + # UTC 03:00 -> Auckland (UTC+13) = 16:00 same date; local date = 2026-03-28 + lo, _ = _range( + rewrite_natural_date_keywords("created:today", AUCKLAND), + "created", + ) + assert lo == "2026-03-28T00:00:00Z" + + @pytest.mark.parametrize( + ("field", "keyword", "expected_lo", "expected_hi"), + [ + pytest.param( + "created", + "yesterday", + "2026-03-27T00:00:00Z", + "2026-03-28T00:00:00Z", + id="yesterday", + ), + pytest.param( + "created", + "this_week", + "2026-03-23T00:00:00Z", + "2026-03-30T00:00:00Z", + id="this_week_mon_sun", + ), + pytest.param( + "created", + "last_week", + "2026-03-16T00:00:00Z", + "2026-03-23T00:00:00Z", + id="last_week", + ), + pytest.param( + "created", + "this_month", + "2026-03-01T00:00:00Z", + "2026-04-01T00:00:00Z", + id="this_month", + ), + pytest.param( + "created", + "last_month", + "2026-02-01T00:00:00Z", + "2026-03-01T00:00:00Z", + id="last_month", + ), + pytest.param( + "created", + "this_year", + "2026-01-01T00:00:00Z", + 
"2027-01-01T00:00:00Z", + id="this_year", + ), + pytest.param( + "created", + "last_year", + "2025-01-01T00:00:00Z", + "2026-01-01T00:00:00Z", + id="last_year", + ), + ], + ) + @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False) + def test_date_keywords( + self, + field: str, + keyword: str, + expected_lo: str, + expected_hi: str, + ) -> None: + # 2026-03-28 is Saturday; Mon-Sun week calculation built into expectations + query = f"{field}:{keyword}" + lo, hi = _range(rewrite_natural_date_keywords(query, UTC), field) + assert lo == expected_lo + assert hi == expected_hi + + @time_machine.travel(datetime(2026, 12, 15, 12, 0, tzinfo=UTC), tick=False) + def test_this_month_december_wraps_to_next_year(self) -> None: + # December: next month must roll over to January 1 of next year + lo, hi = _range( + rewrite_natural_date_keywords("created:this_month", UTC), + "created", + ) + assert lo == "2026-12-01T00:00:00Z" + assert hi == "2027-01-01T00:00:00Z" + + @time_machine.travel(datetime(2026, 1, 15, 12, 0, tzinfo=UTC), tick=False) + def test_last_month_january_wraps_to_previous_year(self) -> None: + # January: last month must roll back to December 1 of previous year + lo, hi = _range( + rewrite_natural_date_keywords("created:last_month", UTC), + "created", + ) + assert lo == "2025-12-01T00:00:00Z" + assert hi == "2026-01-01T00:00:00Z" + + def test_unknown_keyword_raises(self) -> None: + with pytest.raises(ValueError, match="Unknown keyword"): + _date_only_range("bogus_keyword", UTC) + + +class TestDateTimeFields: + """ + added/modified store full UTC datetimes. Natural keywords must convert + the local day boundaries to UTC - timezone offset arithmetic IS required. 
+ """ + + @time_machine.travel(datetime(2026, 3, 28, 15, 30, tzinfo=UTC), tick=False) + def test_added_today_eastern(self) -> None: + # EDT = UTC-4; local midnight 2026-03-28 00:00 EDT = 2026-03-28 04:00 UTC + lo, hi = _range(rewrite_natural_date_keywords("added:today", EASTERN), "added") + assert lo == "2026-03-28T04:00:00Z" + assert hi == "2026-03-29T04:00:00Z" + + @time_machine.travel(datetime(2026, 3, 29, 2, 0, tzinfo=UTC), tick=False) + def test_added_today_auckland_midnight_crossing(self) -> None: + # UTC 02:00 on 2026-03-29 -> Auckland (UTC+13) = 2026-03-29 15:00 local + # Auckland midnight = UTC 2026-03-28 11:00 + lo, hi = _range(rewrite_natural_date_keywords("added:today", AUCKLAND), "added") + assert lo == "2026-03-28T11:00:00Z" + assert hi == "2026-03-29T11:00:00Z" + + @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False) + def test_modified_today_utc(self) -> None: + lo, hi = _range( + rewrite_natural_date_keywords("modified:today", UTC), + "modified", + ) + assert lo == "2026-03-28T00:00:00Z" + assert hi == "2026-03-29T00:00:00Z" + + @pytest.mark.parametrize( + ("keyword", "expected_lo", "expected_hi"), + [ + pytest.param( + "yesterday", + "2026-03-27T00:00:00Z", + "2026-03-28T00:00:00Z", + id="yesterday", + ), + pytest.param( + "this_week", + "2026-03-23T00:00:00Z", + "2026-03-30T00:00:00Z", + id="this_week", + ), + pytest.param( + "last_week", + "2026-03-16T00:00:00Z", + "2026-03-23T00:00:00Z", + id="last_week", + ), + pytest.param( + "this_month", + "2026-03-01T00:00:00Z", + "2026-04-01T00:00:00Z", + id="this_month", + ), + pytest.param( + "last_month", + "2026-02-01T00:00:00Z", + "2026-03-01T00:00:00Z", + id="last_month", + ), + pytest.param( + "this_year", + "2026-01-01T00:00:00Z", + "2027-01-01T00:00:00Z", + id="this_year", + ), + pytest.param( + "last_year", + "2025-01-01T00:00:00Z", + "2026-01-01T00:00:00Z", + id="last_year", + ), + ], + ) + @time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False) + def 
test_datetime_keywords_utc( + self, + keyword: str, + expected_lo: str, + expected_hi: str, + ) -> None: + # 2026-03-28 is Saturday; weekday()==5 so Monday=2026-03-23 + lo, hi = _range(rewrite_natural_date_keywords(f"added:{keyword}", UTC), "added") + assert lo == expected_lo + assert hi == expected_hi + + @time_machine.travel(datetime(2026, 12, 15, 12, 0, tzinfo=UTC), tick=False) + def test_this_month_december_wraps_to_next_year(self) -> None: + # December: next month wraps to January of next year + lo, hi = _range(rewrite_natural_date_keywords("added:this_month", UTC), "added") + assert lo == "2026-12-01T00:00:00Z" + assert hi == "2027-01-01T00:00:00Z" + + @time_machine.travel(datetime(2026, 1, 15, 12, 0, tzinfo=UTC), tick=False) + def test_last_month_january_wraps_to_previous_year(self) -> None: + # January: last month wraps back to December of previous year + lo, hi = _range(rewrite_natural_date_keywords("added:last_month", UTC), "added") + assert lo == "2025-12-01T00:00:00Z" + assert hi == "2026-01-01T00:00:00Z" + + def test_unknown_keyword_raises(self) -> None: + with pytest.raises(ValueError, match="Unknown keyword"): + _datetime_range("bogus_keyword", UTC) + + +class TestWhooshQueryRewriting: + """All Whoosh query syntax variants must be rewritten to ISO 8601 before Tantivy parses them.""" + + @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False) + def test_compact_date_shim_rewrites_to_iso(self) -> None: + result = rewrite_natural_date_keywords("created:20240115120000", UTC) + assert "2024-01-15" in result + assert "20240115120000" not in result + + @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False) + def test_relative_range_shim_removes_now(self) -> None: + result = rewrite_natural_date_keywords("added:[now-7d TO now]", UTC) + assert "now" not in result + assert "2026-03-" in result + + @time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False) + def test_bracket_minus_7_days(self) -> None: + 
lo, hi = _range( + rewrite_natural_date_keywords("added:[-7 days to now]", UTC), + "added", + ) + assert lo == "2026-03-21T12:00:00Z" + assert hi == "2026-03-28T12:00:00Z" + + @time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False) + def test_bracket_minus_1_week(self) -> None: + lo, hi = _range( + rewrite_natural_date_keywords("added:[-1 week to now]", UTC), + "added", + ) + assert lo == "2026-03-21T12:00:00Z" + assert hi == "2026-03-28T12:00:00Z" + + @time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False) + def test_bracket_minus_1_month_uses_relativedelta(self) -> None: + # relativedelta(months=1) from 2026-03-28 = 2026-02-28 (not 29) + lo, hi = _range( + rewrite_natural_date_keywords("created:[-1 month to now]", UTC), + "created", + ) + assert lo == "2026-02-28T12:00:00Z" + assert hi == "2026-03-28T12:00:00Z" + + @time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False) + def test_bracket_minus_1_year(self) -> None: + lo, hi = _range( + rewrite_natural_date_keywords("modified:[-1 year to now]", UTC), + "modified", + ) + assert lo == "2025-03-28T12:00:00Z" + assert hi == "2026-03-28T12:00:00Z" + + @time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False) + def test_bracket_plural_unit_hours(self) -> None: + lo, hi = _range( + rewrite_natural_date_keywords("added:[-3 hours to now]", UTC), + "added", + ) + assert lo == "2026-03-28T09:00:00Z" + assert hi == "2026-03-28T12:00:00Z" + + @time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False) + def test_bracket_case_insensitive(self) -> None: + result = rewrite_natural_date_keywords("added:[-1 WEEK TO NOW]", UTC) + assert "now" not in result.lower() + lo, hi = _range(result, "added") + assert lo == "2026-03-21T12:00:00Z" + assert hi == "2026-03-28T12:00:00Z" + + @time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False) + def test_relative_range_swaps_bounds_when_lo_exceeds_hi(self) -> None: + # [now+1h TO now-1h] 
has lo > hi before substitution; they must be swapped + lo, hi = _range( + rewrite_natural_date_keywords("added:[now+1h TO now-1h]", UTC), + "added", + ) + assert lo == "2026-03-28T11:00:00Z" + assert hi == "2026-03-28T13:00:00Z" + + def test_8digit_created_date_field_always_uses_utc_midnight(self) -> None: + # created is a DateField: boundaries are always UTC midnight, no TZ offset + result = rewrite_natural_date_keywords("created:20231201", EASTERN) + lo, hi = _range(result, "created") + assert lo == "2023-12-01T00:00:00Z" + assert hi == "2023-12-02T00:00:00Z" + + def test_8digit_added_datetime_field_converts_local_midnight_to_utc(self) -> None: + # added is DateTimeField: midnight Dec 1 Eastern (EST = UTC-5) = 05:00 UTC + result = rewrite_natural_date_keywords("added:20231201", EASTERN) + lo, hi = _range(result, "added") + assert lo == "2023-12-01T05:00:00Z" + assert hi == "2023-12-02T05:00:00Z" + + def test_8digit_modified_datetime_field_converts_local_midnight_to_utc( + self, + ) -> None: + result = rewrite_natural_date_keywords("modified:20231201", EASTERN) + lo, hi = _range(result, "modified") + assert lo == "2023-12-01T05:00:00Z" + assert hi == "2023-12-02T05:00:00Z" + + def test_8digit_invalid_date_passes_through_unchanged(self) -> None: + assert rewrite_natural_date_keywords("added:20231340", UTC) == "added:20231340" + + def test_compact_14digit_invalid_date_passes_through_unchanged(self) -> None: + # Month=13 makes datetime() raise ValueError; the token must be left as-is + assert _rewrite_compact_date("20231300120000") == "20231300120000" + + +class TestParseUserQuery: + """parse_user_query runs the full preprocessing pipeline.""" + + @pytest.fixture + def query_index(self) -> tantivy.Index: + schema = build_schema() + idx = tantivy.Index(schema, path=None) + register_tokenizers(idx, "") + return idx + + def test_returns_tantivy_query(self, query_index: tantivy.Index) -> None: + assert isinstance(parse_user_query(query_index, "invoice", UTC), 
tantivy.Query) + + def test_fuzzy_mode_does_not_raise( + self, + query_index: tantivy.Index, + settings, + ) -> None: + settings.ADVANCED_FUZZY_SEARCH_THRESHOLD = 0.5 + assert isinstance(parse_user_query(query_index, "invoice", UTC), tantivy.Query) + + def test_date_rewriting_applied_before_tantivy_parse( + self, + query_index: tantivy.Index, + ) -> None: + # created:today must be rewritten to an ISO range before Tantivy parses it; + # if passed raw, Tantivy would reject "today" as an invalid date value + with time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False): + q = parse_user_query(query_index, "created:today", UTC) + assert isinstance(q, tantivy.Query) + + +class TestPassthrough: + """Queries without field prefixes or unrelated content pass through unchanged.""" + + def test_bare_keyword_no_field_prefix_unchanged(self) -> None: + # Bare 'today' with no field: prefix passes through unchanged + result = rewrite_natural_date_keywords("bank statement today", UTC) + assert "today" in result + + def test_unrelated_query_unchanged(self) -> None: + assert rewrite_natural_date_keywords("title:invoice", UTC) == "title:invoice" + + +class TestNormalizeQuery: + """normalize_query expands comma-separated values and collapses whitespace.""" + + def test_normalize_expands_comma_separated_tags(self) -> None: + assert normalize_query("tag:foo,bar") == "tag:foo AND tag:bar" + + def test_normalize_expands_three_values(self) -> None: + assert normalize_query("tag:foo,bar,baz") == "tag:foo AND tag:bar AND tag:baz" + + def test_normalize_collapses_whitespace(self) -> None: + assert normalize_query("bank statement") == "bank statement" + + def test_normalize_no_commas_unchanged(self) -> None: + assert normalize_query("bank statement") == "bank statement" + + +class TestPermissionFilter: + """ + build_permission_filter tests use an in-memory index — no DB access needed. 
+ + Users are constructed as unsaved model instances (django_user_model(pk=N)) + so no database round-trip occurs; only .pk is read by build_permission_filter. + """ + + @pytest.fixture + def perm_index(self) -> tantivy.Index: + schema = build_schema() + idx = tantivy.Index(schema, path=None) + register_tokenizers(idx, "") + return idx + + def _add_doc( + self, + idx: tantivy.Index, + doc_id: int, + owner_id: int | None = None, + viewer_ids: tuple[int, ...] = (), + ) -> None: + writer = idx.writer() + doc = tantivy.Document() + doc.add_unsigned("id", doc_id) + # Only add owner_id field if the document has an owner + if owner_id is not None: + doc.add_unsigned("owner_id", owner_id) + for vid in viewer_ids: + doc.add_unsigned("viewer_id", vid) + writer.add_document(doc) + writer.commit() + idx.reload() + + def test_perm_no_owner_visible_to_any_user( + self, + perm_index: tantivy.Index, + django_user_model: type[AbstractBaseUser], + ) -> None: + """Documents with no owner must be visible to every user.""" + self._add_doc(perm_index, doc_id=1, owner_id=None) + user = django_user_model(pk=99) + perm = build_permission_filter(perm_index.schema, user) + assert perm_index.searcher().search(perm, limit=10).count == 1 + + def test_perm_owned_by_user_is_visible( + self, + perm_index: tantivy.Index, + django_user_model: type[AbstractBaseUser], + ) -> None: + """A document owned by the requesting user must be visible.""" + self._add_doc(perm_index, doc_id=2, owner_id=42) + user = django_user_model(pk=42) + perm = build_permission_filter(perm_index.schema, user) + assert perm_index.searcher().search(perm, limit=10).count == 1 + + def test_perm_owned_by_other_not_visible( + self, + perm_index: tantivy.Index, + django_user_model: type[AbstractBaseUser], + ) -> None: + """A document owned by a different user must not be visible.""" + self._add_doc(perm_index, doc_id=3, owner_id=42) + user = django_user_model(pk=99) + perm = build_permission_filter(perm_index.schema, user) + assert 
perm_index.searcher().search(perm, limit=10).count == 0 + + def test_perm_shared_viewer_is_visible( + self, + perm_index: tantivy.Index, + django_user_model: type[AbstractBaseUser], + ) -> None: + """A document explicitly shared with a user must be visible to that user.""" + self._add_doc(perm_index, doc_id=4, owner_id=42, viewer_ids=(99,)) + user = django_user_model(pk=99) + perm = build_permission_filter(perm_index.schema, user) + assert perm_index.searcher().search(perm, limit=10).count == 1 + + def test_perm_only_owned_docs_hidden_from_others( + self, + perm_index: tantivy.Index, + django_user_model: type[AbstractBaseUser], + ) -> None: + """Only unowned documents appear when the user owns none of them.""" + self._add_doc(perm_index, doc_id=5, owner_id=10) # owned by 10 + self._add_doc(perm_index, doc_id=6, owner_id=None) # unowned + user = django_user_model(pk=20) + perm = build_permission_filter(perm_index.schema, user) + assert perm_index.searcher().search(perm, limit=10).count == 1 # only unowned diff --git a/src/documents/tests/search/test_schema.py b/src/documents/tests/search/test_schema.py new file mode 100644 index 000000000..1ff9bee32 --- /dev/null +++ b/src/documents/tests/search/test_schema.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +from documents.search._schema import SCHEMA_VERSION +from documents.search._schema import needs_rebuild + +if TYPE_CHECKING: + from pathlib import Path + + from pytest_django.fixtures import SettingsWrapper + +pytestmark = pytest.mark.search + + +class TestNeedsRebuild: + """needs_rebuild covers all sentinel-file states that require a full reindex.""" + + def test_returns_true_when_version_file_missing(self, index_dir: Path) -> None: + assert needs_rebuild(index_dir) is True + + def test_returns_false_when_version_and_language_match( + self, + index_dir: Path, + settings: SettingsWrapper, + ) -> None: + settings.SEARCH_LANGUAGE = "en" + (index_dir / 
".schema_version").write_text(str(SCHEMA_VERSION)) + (index_dir / ".schema_language").write_text("en") + assert needs_rebuild(index_dir) is False + + def test_returns_true_on_schema_version_mismatch(self, index_dir: Path) -> None: + (index_dir / ".schema_version").write_text(str(SCHEMA_VERSION - 1)) + assert needs_rebuild(index_dir) is True + + def test_returns_true_when_version_file_not_an_integer( + self, + index_dir: Path, + ) -> None: + (index_dir / ".schema_version").write_text("not-a-number") + assert needs_rebuild(index_dir) is True + + def test_returns_true_when_language_sentinel_missing( + self, + index_dir: Path, + settings: SettingsWrapper, + ) -> None: + settings.SEARCH_LANGUAGE = "en" + (index_dir / ".schema_version").write_text(str(SCHEMA_VERSION)) + # .schema_language intentionally absent + assert needs_rebuild(index_dir) is True + + def test_returns_true_when_language_sentinel_content_differs( + self, + index_dir: Path, + settings: SettingsWrapper, + ) -> None: + settings.SEARCH_LANGUAGE = "de" + (index_dir / ".schema_version").write_text(str(SCHEMA_VERSION)) + (index_dir / ".schema_language").write_text("en") + assert needs_rebuild(index_dir) is True diff --git a/src/documents/tests/search/test_tokenizer.py b/src/documents/tests/search/test_tokenizer.py new file mode 100644 index 000000000..aee52a567 --- /dev/null +++ b/src/documents/tests/search/test_tokenizer.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +import pytest +import tantivy + +from documents.search._tokenizer import _bigram_analyzer +from documents.search._tokenizer import _paperless_text +from documents.search._tokenizer import register_tokenizers + +if TYPE_CHECKING: + from _pytest.logging import LogCaptureFixture + +pytestmark = pytest.mark.search + + +class TestTokenizers: + @pytest.fixture + def content_index(self) -> tantivy.Index: + """Index with just a content field for ASCII folding tests.""" + sb = 
tantivy.SchemaBuilder() + sb.add_text_field("content", stored=True, tokenizer_name="paperless_text") + schema = sb.build() + idx = tantivy.Index(schema, path=None) + idx.register_tokenizer("paperless_text", _paperless_text("")) + return idx + + @pytest.fixture + def bigram_index(self) -> tantivy.Index: + """Index with bigram field for CJK tests.""" + sb = tantivy.SchemaBuilder() + sb.add_text_field( + "bigram_content", + stored=False, + tokenizer_name="bigram_analyzer", + ) + schema = sb.build() + idx = tantivy.Index(schema, path=None) + idx.register_tokenizer("bigram_analyzer", _bigram_analyzer()) + return idx + + def test_ascii_fold_finds_accented_content( + self, + content_index: tantivy.Index, + ) -> None: + """ASCII folding allows searching accented text with plain ASCII queries.""" + writer = content_index.writer() + doc = tantivy.Document() + doc.add_text("content", "café résumé") + writer.add_document(doc) + writer.commit() + content_index.reload() + q = content_index.parse_query("cafe resume", ["content"]) + assert content_index.searcher().search(q, limit=5).count == 1 + + def test_bigram_finds_cjk_substring(self, bigram_index: tantivy.Index) -> None: + """Bigram tokenizer enables substring search in CJK languages without whitespace delimiters.""" + writer = bigram_index.writer() + doc = tantivy.Document() + doc.add_text("bigram_content", "東京都") + writer.add_document(doc) + writer.commit() + bigram_index.reload() + q = bigram_index.parse_query("東京", ["bigram_content"]) + assert bigram_index.searcher().search(q, limit=5).count == 1 + + def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None: + """Unsupported language codes should log a warning and disable stemming gracefully.""" + sb = tantivy.SchemaBuilder() + sb.add_text_field("content", stored=True, tokenizer_name="paperless_text") + schema = sb.build() + idx = tantivy.Index(schema, path=None) + + with caplog.at_level(logging.WARNING, logger="paperless.search"): + 
register_tokenizers(idx, "klingon") + assert "klingon" in caplog.text diff --git a/src/documents/tests/test_admin.py b/src/documents/tests/test_admin.py index de2f07df5..533319c2f 100644 --- a/src/documents/tests/test_admin.py +++ b/src/documents/tests/test_admin.py @@ -1,6 +1,7 @@ import types from unittest.mock import patch +import tantivy from django.contrib.admin.sites import AdminSite from django.contrib.auth.models import Permission from django.contrib.auth.models import User @@ -8,36 +9,54 @@ from django.test import TestCase from django.utils import timezone from rest_framework import status -from documents import index from documents.admin import DocumentAdmin from documents.admin import TagAdmin from documents.models import Document from documents.models import Tag +from documents.search import get_backend +from documents.search import reset_backend from documents.tests.utils import DirectoriesMixin from paperless.admin import PaperlessUserAdmin class TestDocumentAdmin(DirectoriesMixin, TestCase): def get_document_from_index(self, doc): - ix = index.open_index() - with ix.searcher() as searcher: - return searcher.document(id=doc.id) + backend = get_backend() + searcher = backend._index.searcher() + results = searcher.search( + tantivy.Query.range_query( + backend._schema, + "id", + tantivy.FieldType.Unsigned, + doc.pk, + doc.pk, + ), + limit=1, + ) + if results.hits: + return searcher.doc(results.hits[0][1]).to_dict() + return None def setUp(self) -> None: super().setUp() + reset_backend() self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite()) + def tearDown(self) -> None: + reset_backend() + super().tearDown() + def test_save_model(self) -> None: doc = Document.objects.create(title="test") doc.title = "new title" self.doc_admin.save_model(None, doc, None, None) self.assertEqual(Document.objects.get(id=doc.id).title, "new title") - self.assertEqual(self.get_document_from_index(doc)["id"], doc.id) + 
self.assertEqual(self.get_document_from_index(doc)["id"], [doc.id]) def test_delete_model(self) -> None: doc = Document.objects.create(title="test") - index.add_or_update_document(doc) + get_backend().add_or_update(doc) self.assertIsNotNone(self.get_document_from_index(doc)) self.doc_admin.delete_model(None, doc) @@ -53,7 +72,7 @@ class TestDocumentAdmin(DirectoriesMixin, TestCase): checksum=f"{i:02}", ) docs.append(doc) - index.add_or_update_document(doc) + get_backend().add_or_update(doc) self.assertEqual(Document.objects.count(), 42) diff --git a/src/documents/tests/test_api_document_versions.py b/src/documents/tests/test_api_document_versions.py index f5c1a7346..d95e78fe9 100644 --- a/src/documents/tests/test_api_document_versions.py +++ b/src/documents/tests/test_api_document_versions.py @@ -109,7 +109,7 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase): mime_type="application/pdf", ) - with mock.patch("documents.index.remove_document_from_index"): + with mock.patch("documents.search.get_backend"): resp = self.client.delete(f"/api/documents/{root.id}/versions/{root.id}/") self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST) @@ -137,10 +137,7 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase): content="v2-content", ) - with ( - mock.patch("documents.index.remove_document_from_index"), - mock.patch("documents.index.add_or_update_document"), - ): + with mock.patch("documents.search.get_backend"): resp = self.client.delete(f"/api/documents/{root.id}/versions/{v2.id}/") self.assertEqual(resp.status_code, status.HTTP_200_OK) @@ -149,10 +146,7 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase): root.refresh_from_db() self.assertEqual(root.content, "root-content") - with ( - mock.patch("documents.index.remove_document_from_index"), - mock.patch("documents.index.add_or_update_document"), - ): + with mock.patch("documents.search.get_backend"): resp = 
self.client.delete(f"/api/documents/{root.id}/versions/{v1.id}/") self.assertEqual(resp.status_code, status.HTTP_200_OK) @@ -175,10 +169,7 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase): ) version_id = version.id - with ( - mock.patch("documents.index.remove_document_from_index"), - mock.patch("documents.index.add_or_update_document"), - ): + with mock.patch("documents.search.get_backend"): resp = self.client.delete( f"/api/documents/{root.id}/versions/{version_id}/", ) @@ -225,7 +216,7 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase): root_document=other_root, ) - with mock.patch("documents.index.remove_document_from_index"): + with mock.patch("documents.search.get_backend"): resp = self.client.delete( f"/api/documents/{root.id}/versions/{other_version.id}/", ) @@ -245,10 +236,7 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase): root_document=root, ) - with ( - mock.patch("documents.index.remove_document_from_index"), - mock.patch("documents.index.add_or_update_document"), - ): + with mock.patch("documents.search.get_backend"): resp = self.client.delete( f"/api/documents/{version.id}/versions/{version.id}/", ) @@ -275,18 +263,17 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase): root_document=root, ) - with ( - mock.patch("documents.index.remove_document_from_index") as remove_index, - mock.patch("documents.index.add_or_update_document") as add_or_update, - ): + with mock.patch("documents.search.get_backend") as mock_get_backend: + mock_backend = mock.MagicMock() + mock_get_backend.return_value = mock_backend resp = self.client.delete( f"/api/documents/{root.id}/versions/{version.id}/", ) self.assertEqual(resp.status_code, status.HTTP_200_OK) - remove_index.assert_called_once_with(version) - add_or_update.assert_called_once() - self.assertEqual(add_or_update.call_args[0][0].id, root.id) + mock_backend.remove.assert_called_once_with(version.pk) + mock_backend.add_or_update.assert_called_once() + 
self.assertEqual(mock_backend.add_or_update.call_args[0][0].id, root.id) def test_delete_version_returns_403_without_permission(self) -> None: owner = User.objects.create_user(username="owner") diff --git a/src/documents/tests/test_api_search.py b/src/documents/tests/test_api_search.py index 546dff233..69bd65198 100644 --- a/src/documents/tests/test_api_search.py +++ b/src/documents/tests/test_api_search.py @@ -2,6 +2,7 @@ import datetime from datetime import timedelta from unittest import mock +import pytest from dateutil.relativedelta import relativedelta from django.contrib.auth.models import Group from django.contrib.auth.models import Permission @@ -11,9 +12,7 @@ from django.utils import timezone from guardian.shortcuts import assign_perm from rest_framework import status from rest_framework.test import APITestCase -from whoosh.writing import AsyncWriter -from documents import index from documents.bulk_edit import set_permissions from documents.models import Correspondent from documents.models import CustomField @@ -25,18 +24,27 @@ from documents.models import SavedView from documents.models import StoragePath from documents.models import Tag from documents.models import Workflow +from documents.search import get_backend +from documents.search import reset_backend from documents.tests.utils import DirectoriesMixin from paperless_mail.models import MailAccount from paperless_mail.models import MailRule +pytestmark = pytest.mark.search + class TestDocumentSearchApi(DirectoriesMixin, APITestCase): def setUp(self) -> None: super().setUp() + reset_backend() self.user = User.objects.create_superuser(username="temp_admin") self.client.force_authenticate(user=self.user) + def tearDown(self) -> None: + reset_backend() + super().tearDown() + def test_search(self) -> None: d1 = Document.objects.create( title="invoice", @@ -57,13 +65,11 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): checksum="C", original_filename="someepdf.pdf", ) - with 
AsyncWriter(index.open_index()) as writer: - # Note to future self: there is a reason we dont use a model signal handler to update the index: some operations edit many documents at once - # (retagger, renamer) and we don't want to open a writer for each of these, but rather perform the entire operation with one writer. - # That's why we can't open the writer in a model on_save handler or something. - index.update_document(writer, d1) - index.update_document(writer, d2) - index.update_document(writer, d3) + backend = get_backend() + backend.add_or_update(d1) + backend.add_or_update(d2) + backend.add_or_update(d3) + response = self.client.get("/api/documents/?query=bank") results = response.data["results"] self.assertEqual(response.data["count"], 3) @@ -98,9 +104,9 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): checksum="B", pk=2, ) - with AsyncWriter(index.open_index()) as writer: - index.update_document(writer, d1) - index.update_document(writer, d2) + backend = get_backend() + backend.add_or_update(d1) + backend.add_or_update(d2) response = self.client.get( "/api/documents/?query=bank", @@ -127,8 +133,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): ) matching_doc.tags.add(tag) - with AsyncWriter(index.open_index()) as writer: - index.update_document(writer, matching_doc) + get_backend().add_or_update(matching_doc) response = self.client.get( "/api/documents/?query=bank&include_selection_data=true", @@ -187,10 +192,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): value_int=20, ) - with AsyncWriter(index.open_index()) as writer: - index.update_document(writer, d1) - index.update_document(writer, d2) - index.update_document(writer, d3) + backend = get_backend() + backend.add_or_update(d1) + backend.add_or_update(d2) + backend.add_or_update(d3) response = self.client.get( f"/api/documents/?query=match&ordering=custom_field_{custom_field.pk}", @@ -211,15 +216,15 @@ class TestDocumentSearchApi(DirectoriesMixin, 
APITestCase): ) def test_search_multi_page(self) -> None: - with AsyncWriter(index.open_index()) as writer: - for i in range(55): - doc = Document.objects.create( - checksum=str(i), - pk=i + 1, - title=f"Document {i + 1}", - content="content", - ) - index.update_document(writer, doc) + backend = get_backend() + for i in range(55): + doc = Document.objects.create( + checksum=str(i), + pk=i + 1, + title=f"Document {i + 1}", + content="content", + ) + backend.add_or_update(doc) # This is here so that we test that no document gets returned twice (might happen if the paging is not working) seen_ids = [] @@ -246,15 +251,15 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): seen_ids.append(result["id"]) def test_search_invalid_page(self) -> None: - with AsyncWriter(index.open_index()) as writer: - for i in range(15): - doc = Document.objects.create( - checksum=str(i), - pk=i + 1, - title=f"Document {i + 1}", - content="content", - ) - index.update_document(writer, doc) + backend = get_backend() + for i in range(15): + doc = Document.objects.create( + checksum=str(i), + pk=i + 1, + title=f"Document {i + 1}", + content="content", + ) + backend.add_or_update(doc) response = self.client.get("/api/documents/?query=content&page=0&page_size=10") self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) @@ -292,26 +297,25 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): pk=3, checksum="C", ) - with index.open_index_writer() as writer: - index.update_document(writer, d1) - index.update_document(writer, d2) - index.update_document(writer, d3) + backend = get_backend() + backend.add_or_update(d1) + backend.add_or_update(d2) + backend.add_or_update(d3) response = self.client.get("/api/documents/?query=added:[-1 week to now]") results = response.data["results"] # Expect 3 documents returned self.assertEqual(len(results), 3) - for idx, subset in enumerate( - [ - {"id": 1, "title": "invoice"}, - {"id": 2, "title": "bank statement 1"}, - {"id": 3, 
"title": "bank statement 3"}, - ], - ): - result = results[idx] - # Assert subset in results - self.assertDictEqual(result, {**result, **subset}) + result_map = {r["id"]: r for r in results} + self.assertEqual(set(result_map.keys()), {1, 2, 3}) + for subset in [ + {"id": 1, "title": "invoice"}, + {"id": 2, "title": "bank statement 1"}, + {"id": 3, "title": "bank statement 3"}, + ]: + r = result_map[subset["id"]] + self.assertDictEqual(r, {**r, **subset}) @override_settings( TIME_ZONE="America/Chicago", @@ -347,10 +351,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): # 7 days, 1 hour and 1 minute ago added=timezone.now() - timedelta(days=7, hours=1, minutes=1), ) - with index.open_index_writer() as writer: - index.update_document(writer, d1) - index.update_document(writer, d2) - index.update_document(writer, d3) + backend = get_backend() + backend.add_or_update(d1) + backend.add_or_update(d2) + backend.add_or_update(d3) response = self.client.get("/api/documents/?query=added:[-1 week to now]") results = response.data["results"] @@ -358,12 +362,14 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): # Expect 2 documents returned self.assertEqual(len(results), 2) - for idx, subset in enumerate( - [{"id": 1, "title": "invoice"}, {"id": 2, "title": "bank statement 1"}], - ): - result = results[idx] - # Assert subset in results - self.assertDictEqual(result, {**result, **subset}) + result_map = {r["id"]: r for r in results} + self.assertEqual(set(result_map.keys()), {1, 2}) + for subset in [ + {"id": 1, "title": "invoice"}, + {"id": 2, "title": "bank statement 1"}, + ]: + r = result_map[subset["id"]] + self.assertDictEqual(r, {**r, **subset}) @override_settings( TIME_ZONE="Europe/Sofia", @@ -399,10 +405,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): # 7 days, 1 hour and 1 minute ago added=timezone.now() - timedelta(days=7, hours=1, minutes=1), ) - with index.open_index_writer() as writer: - index.update_document(writer, d1) - 
index.update_document(writer, d2) - index.update_document(writer, d3) + backend = get_backend() + backend.add_or_update(d1) + backend.add_or_update(d2) + backend.add_or_update(d3) response = self.client.get("/api/documents/?query=added:[-1 week to now]") results = response.data["results"] @@ -410,12 +416,14 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): # Expect 2 documents returned self.assertEqual(len(results), 2) - for idx, subset in enumerate( - [{"id": 1, "title": "invoice"}, {"id": 2, "title": "bank statement 1"}], - ): - result = results[idx] - # Assert subset in results - self.assertDictEqual(result, {**result, **subset}) + result_map = {r["id"]: r for r in results} + self.assertEqual(set(result_map.keys()), {1, 2}) + for subset in [ + {"id": 1, "title": "invoice"}, + {"id": 2, "title": "bank statement 1"}, + ]: + r = result_map[subset["id"]] + self.assertDictEqual(r, {**r, **subset}) def test_search_added_in_last_month(self) -> None: """ @@ -451,10 +459,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): added=timezone.now() - timedelta(days=7, hours=1, minutes=1), ) - with index.open_index_writer() as writer: - index.update_document(writer, d1) - index.update_document(writer, d2) - index.update_document(writer, d3) + backend = get_backend() + backend.add_or_update(d1) + backend.add_or_update(d2) + backend.add_or_update(d3) response = self.client.get("/api/documents/?query=added:[-1 month to now]") results = response.data["results"] @@ -462,12 +470,14 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): # Expect 2 documents returned self.assertEqual(len(results), 2) - for idx, subset in enumerate( - [{"id": 1, "title": "invoice"}, {"id": 3, "title": "bank statement 3"}], - ): - result = results[idx] - # Assert subset in results - self.assertDictEqual(result, {**result, **subset}) + result_map = {r["id"]: r for r in results} + self.assertEqual(set(result_map.keys()), {1, 3}) + for subset in [ + {"id": 1, "title": 
"invoice"}, + {"id": 3, "title": "bank statement 3"}, + ]: + r = result_map[subset["id"]] + self.assertDictEqual(r, {**r, **subset}) @override_settings( TIME_ZONE="America/Denver", @@ -507,10 +517,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): added=timezone.now() - timedelta(days=7, hours=1, minutes=1), ) - with index.open_index_writer() as writer: - index.update_document(writer, d1) - index.update_document(writer, d2) - index.update_document(writer, d3) + backend = get_backend() + backend.add_or_update(d1) + backend.add_or_update(d2) + backend.add_or_update(d3) response = self.client.get("/api/documents/?query=added:[-1 month to now]") results = response.data["results"] @@ -518,12 +528,14 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): # Expect 2 documents returned self.assertEqual(len(results), 2) - for idx, subset in enumerate( - [{"id": 1, "title": "invoice"}, {"id": 3, "title": "bank statement 3"}], - ): - result = results[idx] - # Assert subset in results - self.assertDictEqual(result, {**result, **subset}) + result_map = {r["id"]: r for r in results} + self.assertEqual(set(result_map.keys()), {1, 3}) + for subset in [ + {"id": 1, "title": "invoice"}, + {"id": 3, "title": "bank statement 3"}, + ]: + r = result_map[subset["id"]] + self.assertDictEqual(r, {**r, **subset}) @override_settings( TIME_ZONE="Europe/Sofia", @@ -563,10 +575,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): # Django converts dates to UTC d3.refresh_from_db() - with index.open_index_writer() as writer: - index.update_document(writer, d1) - index.update_document(writer, d2) - index.update_document(writer, d3) + backend = get_backend() + backend.add_or_update(d1) + backend.add_or_update(d2) + backend.add_or_update(d3) response = self.client.get("/api/documents/?query=added:20231201") results = response.data["results"] @@ -574,12 +586,8 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): # Expect 1 document returned 
self.assertEqual(len(results), 1) - for idx, subset in enumerate( - [{"id": 3, "title": "bank statement 3"}], - ): - result = results[idx] - # Assert subset in results - self.assertDictEqual(result, {**result, **subset}) + self.assertEqual(results[0]["id"], 3) + self.assertEqual(results[0]["title"], "bank statement 3") def test_search_added_invalid_date(self) -> None: """ @@ -588,7 +596,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): WHEN: - Query with invalid added date THEN: - - No documents returned + - 400 Bad Request returned (Tantivy rejects invalid date field syntax) """ d1 = Document.objects.create( title="invoice", @@ -597,16 +605,14 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): pk=1, ) - with index.open_index_writer() as writer: - index.update_document(writer, d1) + get_backend().add_or_update(d1) response = self.client.get("/api/documents/?query=added:invalid-date") - results = response.data["results"] - # Expect 0 document returned - self.assertEqual(len(results), 0) + # Tantivy rejects unparsable field queries with a 400 + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - @mock.patch("documents.index.autocomplete") + @mock.patch("documents.search._backend.TantivyBackend.autocomplete") def test_search_autocomplete_limits(self, m) -> None: """ GIVEN: @@ -618,7 +624,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): - Limit requests are obeyed """ - m.side_effect = lambda ix, term, limit, user: [term for _ in range(limit)] + m.side_effect = lambda term, limit, user=None: [term for _ in range(limit)] response = self.client.get("/api/search/autocomplete/?term=test") self.assertEqual(response.status_code, status.HTTP_200_OK) @@ -671,32 +677,29 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): owner=u1, ) - with AsyncWriter(index.open_index()) as writer: - index.update_document(writer, d1) - index.update_document(writer, d2) - index.update_document(writer, d3) + backend = 
get_backend() + backend.add_or_update(d1) + backend.add_or_update(d2) + backend.add_or_update(d3) response = self.client.get("/api/search/autocomplete/?term=app") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data, [b"apples", b"applebaum", b"appletini"]) + self.assertEqual(response.data, ["applebaum", "apples", "appletini"]) d3.owner = u2 - - with AsyncWriter(index.open_index()) as writer: - index.update_document(writer, d3) + d3.save() + backend.add_or_update(d3) response = self.client.get("/api/search/autocomplete/?term=app") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data, [b"apples", b"applebaum"]) + self.assertEqual(response.data, ["applebaum", "apples"]) assign_perm("view_document", u1, d3) - - with AsyncWriter(index.open_index()) as writer: - index.update_document(writer, d3) + backend.add_or_update(d3) response = self.client.get("/api/search/autocomplete/?term=app") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data, [b"apples", b"applebaum", b"appletini"]) + self.assertEqual(response.data, ["applebaum", "apples", "appletini"]) def test_search_autocomplete_field_name_match(self) -> None: """ @@ -714,8 +717,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): checksum="1", ) - with AsyncWriter(index.open_index()) as writer: - index.update_document(writer, d1) + get_backend().add_or_update(d1) response = self.client.get("/api/search/autocomplete/?term=created:2023") self.assertEqual(response.status_code, status.HTTP_200_OK) @@ -736,33 +738,36 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): checksum="1", ) - with AsyncWriter(index.open_index()) as writer: - index.update_document(writer, d1) + get_backend().add_or_update(d1) response = self.client.get("/api/search/autocomplete/?term=auto") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data[0], b"auto") + 
self.assertEqual(response.data[0], "auto") - def test_search_spelling_suggestion(self) -> None: - with AsyncWriter(index.open_index()) as writer: - for i in range(55): - doc = Document.objects.create( - checksum=str(i), - pk=i + 1, - title=f"Document {i + 1}", - content=f"Things document {i + 1}", - ) - index.update_document(writer, doc) + def test_search_no_spelling_suggestion(self) -> None: + """ + GIVEN: + - Documents exist with various terms + WHEN: + - Query for documents with any term + THEN: + - corrected_query is always None (Tantivy has no spell correction) + """ + backend = get_backend() + for i in range(5): + doc = Document.objects.create( + checksum=str(i), + pk=i + 1, + title=f"Document {i + 1}", + content=f"Things document {i + 1}", + ) + backend.add_or_update(doc) response = self.client.get("/api/documents/?query=thing") - correction = response.data["corrected_query"] - - self.assertEqual(correction, "things") + self.assertIsNone(response.data["corrected_query"]) response = self.client.get("/api/documents/?query=things") - correction = response.data["corrected_query"] - - self.assertEqual(correction, None) + self.assertIsNone(response.data["corrected_query"]) def test_search_spelling_suggestion_suppressed_for_private_terms(self): owner = User.objects.create_user("owner") @@ -771,24 +776,24 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): Permission.objects.get(codename="view_document"), ) - with AsyncWriter(index.open_index()) as writer: - for i in range(55): - private_doc = Document.objects.create( - checksum=f"p{i}", - pk=100 + i, - title=f"Private Document {i + 1}", - content=f"treasury document {i + 1}", - owner=owner, - ) - visible_doc = Document.objects.create( - checksum=f"v{i}", - pk=200 + i, - title=f"Visible Document {i + 1}", - content=f"public ledger {i + 1}", - owner=attacker, - ) - index.update_document(writer, private_doc) - index.update_document(writer, visible_doc) + backend = get_backend() + for i in range(5): + 
private_doc = Document.objects.create( + checksum=f"p{i}", + pk=100 + i, + title=f"Private Document {i + 1}", + content=f"treasury document {i + 1}", + owner=owner, + ) + visible_doc = Document.objects.create( + checksum=f"v{i}", + pk=200 + i, + title=f"Visible Document {i + 1}", + content=f"public ledger {i + 1}", + owner=attacker, + ) + backend.add_or_update(private_doc) + backend.add_or_update(visible_doc) self.client.force_authenticate(user=attacker) @@ -798,26 +803,6 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): self.assertEqual(response.data["count"], 0) self.assertIsNone(response.data["corrected_query"]) - @mock.patch( - "whoosh.searching.Searcher.correct_query", - side_effect=Exception("Test error"), - ) - def test_corrected_query_error(self, mock_correct_query) -> None: - """ - GIVEN: - - A query that raises an error on correction - WHEN: - - API request for search with that query - THEN: - - The error is logged and the search proceeds - """ - with self.assertLogs("paperless.index", level="INFO") as cm: - response = self.client.get("/api/documents/?query=2025-06-04") - self.assertEqual(response.status_code, status.HTTP_200_OK) - error_str = cm.output[0] - expected_str = "Error while correcting query '2025-06-04': Test error" - self.assertIn(expected_str, error_str) - def test_search_more_like(self) -> None: """ GIVEN: @@ -847,16 +832,16 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): checksum="C", ) d4 = Document.objects.create( - title="Monty Python & the Holy Grail", - content="And now for something completely different", + title="Quarterly Report", + content="quarterly revenue profit margin earnings growth", pk=4, checksum="ABC", ) - with AsyncWriter(index.open_index()) as writer: - index.update_document(writer, d1) - index.update_document(writer, d2) - index.update_document(writer, d3) - index.update_document(writer, d4) + backend = get_backend() + backend.add_or_update(d1) + backend.add_or_update(d2) + 
backend.add_or_update(d3) + backend.add_or_update(d4) response = self.client.get(f"/api/documents/?more_like_id={d2.id}") @@ -864,9 +849,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): results = response.data["results"] - self.assertEqual(len(results), 2) - self.assertEqual(results[0]["id"], d3.id) - self.assertEqual(results[1]["id"], d1.id) + self.assertGreaterEqual(len(results), 1) + result_ids = [r["id"] for r in results] + self.assertIn(d3.id, result_ids) + self.assertNotIn(d4.id, result_ids) def test_search_more_like_requires_view_permission_on_seed_document( self, @@ -908,10 +894,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): pk=12, ) - with AsyncWriter(index.open_index()) as writer: - index.update_document(writer, private_seed) - index.update_document(writer, visible_doc) - index.update_document(writer, other_doc) + backend = get_backend() + backend.add_or_update(private_seed) + backend.add_or_update(visible_doc) + backend.add_or_update(other_doc) self.client.force_authenticate(user=attacker) @@ -985,9 +971,9 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): value_text="foobard4", ) - with AsyncWriter(index.open_index()) as writer: - for doc in Document.objects.all(): - index.update_document(writer, doc) + backend = get_backend() + for doc in Document.objects.all(): + backend.add_or_update(doc) def search_query(q): r = self.client.get("/api/documents/?query=test" + q) @@ -1203,9 +1189,9 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): Document.objects.create(checksum="3", content="test 3", owner=u2) Document.objects.create(checksum="4", content="test 4") - with AsyncWriter(index.open_index()) as writer: - for doc in Document.objects.all(): - index.update_document(writer, doc) + backend = get_backend() + for doc in Document.objects.all(): + backend.add_or_update(doc) self.client.force_authenticate(user=u1) r = self.client.get("/api/documents/?query=test") @@ -1256,9 +1242,9 @@ class 
TestDocumentSearchApi(DirectoriesMixin, APITestCase): d3 = Document.objects.create(checksum="3", content="test 3", owner=u2) Document.objects.create(checksum="4", content="test 4") - with AsyncWriter(index.open_index()) as writer: - for doc in Document.objects.all(): - index.update_document(writer, doc) + backend = get_backend() + for doc in Document.objects.all(): + backend.add_or_update(doc) self.client.force_authenticate(user=u1) r = self.client.get("/api/documents/?query=test") @@ -1278,9 +1264,9 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): assign_perm("view_document", u1, d3) assign_perm("view_document", u2, d1) - with AsyncWriter(index.open_index()) as writer: - for doc in [d1, d2, d3]: - index.update_document(writer, doc) + backend.add_or_update(d1) + backend.add_or_update(d2) + backend.add_or_update(d3) self.client.force_authenticate(user=u1) r = self.client.get("/api/documents/?query=test") @@ -1343,9 +1329,9 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): user=u1, ) - with AsyncWriter(index.open_index()) as writer: - for doc in Document.objects.all(): - index.update_document(writer, doc) + backend = get_backend() + for doc in Document.objects.all(): + backend.add_or_update(doc) def search_query(q): r = self.client.get("/api/documents/?query=test" + q) @@ -1378,13 +1364,14 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): search_query("&ordering=-num_notes"), [d1.id, d3.id, d2.id], ) + # owner sort: ORM orders by owner_id (integer); NULLs first in SQLite ASC self.assertListEqual( search_query("&ordering=owner"), - [d1.id, d2.id, d3.id], + [d3.id, d1.id, d2.id], ) self.assertListEqual( search_query("&ordering=-owner"), - [d3.id, d2.id, d1.id], + [d2.id, d1.id, d3.id], ) @mock.patch("documents.bulk_edit.bulk_update_documents") @@ -1441,12 +1428,12 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): ) set_permissions([4, 5], set_permissions={}, owner=user2, merge=False) - with index.open_index_writer() 
as writer: - index.update_document(writer, d1) - index.update_document(writer, d2) - index.update_document(writer, d3) - index.update_document(writer, d4) - index.update_document(writer, d5) + backend = get_backend() + backend.add_or_update(d1) + backend.add_or_update(d2) + backend.add_or_update(d3) + backend.add_or_update(d4) + backend.add_or_update(d5) correspondent1 = Correspondent.objects.create(name="bank correspondent 1") Correspondent.objects.create(name="correspondent 2") diff --git a/src/documents/tests/test_api_status.py b/src/documents/tests/test_api_status.py index b8f7d408e..32717af63 100644 --- a/src/documents/tests/test_api_status.py +++ b/src/documents/tests/test_api_status.py @@ -191,40 +191,42 @@ class TestSystemStatus(APITestCase): self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.data["tasks"]["celery_status"], "OK") - @override_settings(INDEX_DIR=Path("/tmp/index")) - @mock.patch("whoosh.index.FileIndex.last_modified") - def test_system_status_index_ok(self, mock_last_modified) -> None: + @mock.patch("documents.search.get_backend") + def test_system_status_index_ok(self, mock_get_backend) -> None: """ GIVEN: - - The index last modified time is set + - The index is accessible WHEN: - The user requests the system status THEN: - The response contains the correct index status """ - mock_last_modified.return_value = 1707839087 - self.client.force_login(self.user) - response = self.client.get(self.ENDPOINT) + mock_get_backend.return_value = mock.MagicMock() + # Use the temp dir created in setUp (self.tmp_dir) as a real INDEX_DIR + # with a real file so the mtime lookup works + sentinel = self.tmp_dir / "sentinel.txt" + sentinel.write_text("ok") + with self.settings(INDEX_DIR=self.tmp_dir): + self.client.force_login(self.user) + response = self.client.get(self.ENDPOINT) self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.data["tasks"]["index_status"], "OK") 
self.assertIsNotNone(response.data["tasks"]["index_last_modified"]) - @override_settings(INDEX_DIR=Path("/tmp/index/")) - @mock.patch("documents.index.open_index", autospec=True) - def test_system_status_index_error(self, mock_open_index) -> None: + @mock.patch("documents.search.get_backend") + def test_system_status_index_error(self, mock_get_backend) -> None: """ GIVEN: - - The index is not found + - The index cannot be opened WHEN: - The user requests the system status THEN: - The response contains the correct index status """ - mock_open_index.return_value = None - mock_open_index.side_effect = Exception("Index error") + mock_get_backend.side_effect = Exception("Index error") self.client.force_login(self.user) response = self.client.get(self.ENDPOINT) - mock_open_index.assert_called_once() + mock_get_backend.assert_called_once() self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.data["tasks"]["index_status"], "ERROR") self.assertIsNotNone(response.data["tasks"]["index_error"]) diff --git a/src/documents/tests/test_delayedquery.py b/src/documents/tests/test_delayedquery.py deleted file mode 100644 index 6357d9030..000000000 --- a/src/documents/tests/test_delayedquery.py +++ /dev/null @@ -1,58 +0,0 @@ -from django.test import TestCase -from whoosh import query - -from documents.index import get_permissions_criterias -from documents.models import User - - -class TestDelayedQuery(TestCase): - def setUp(self) -> None: - super().setUp() - # all tests run without permission criteria, so has_no_owner query will always - # be appended. 
- self.has_no_owner = query.Or([query.Term("has_owner", text=False)]) - - def _get_testset__id__in(self, param, field): - return ( - {f"{param}__id__in": "42,43"}, - query.And( - [ - query.Or( - [ - query.Term(f"{field}_id", "42"), - query.Term(f"{field}_id", "43"), - ], - ), - self.has_no_owner, - ], - ), - ) - - def _get_testset__id__none(self, param, field): - return ( - {f"{param}__id__none": "42,43"}, - query.And( - [ - query.Not(query.Term(f"{field}_id", "42")), - query.Not(query.Term(f"{field}_id", "43")), - self.has_no_owner, - ], - ), - ) - - def test_get_permission_criteria(self) -> None: - # tests contains tuples of user instances and the expected filter - tests = ( - (None, [query.Term("has_owner", text=False)]), - (User(42, username="foo", is_superuser=True), []), - ( - User(42, username="foo", is_superuser=False), - [ - query.Term("has_owner", text=False), - query.Term("owner_id", 42), - query.Term("viewer_id", "42"), - ], - ), - ) - for user, expected in tests: - self.assertEqual(get_permissions_criterias(user), expected) diff --git a/src/documents/tests/test_index.py b/src/documents/tests/test_index.py deleted file mode 100644 index 5f1c7487d..000000000 --- a/src/documents/tests/test_index.py +++ /dev/null @@ -1,371 +0,0 @@ -from datetime import datetime -from unittest import mock - -from django.conf import settings -from django.contrib.auth.models import User -from django.test import SimpleTestCase -from django.test import TestCase -from django.test import override_settings -from django.utils.timezone import get_current_timezone -from django.utils.timezone import timezone - -from documents import index -from documents.models import Document -from documents.tests.utils import DirectoriesMixin - - -class TestAutoComplete(DirectoriesMixin, TestCase): - def test_auto_complete(self) -> None: - doc1 = Document.objects.create( - title="doc1", - checksum="A", - content="test test2 test3", - ) - doc2 = Document.objects.create(title="doc2", checksum="B", 
content="test test2") - doc3 = Document.objects.create(title="doc3", checksum="C", content="test2") - - index.add_or_update_document(doc1) - index.add_or_update_document(doc2) - index.add_or_update_document(doc3) - - ix = index.open_index() - - self.assertListEqual( - index.autocomplete(ix, "tes"), - [b"test2", b"test", b"test3"], - ) - self.assertListEqual( - index.autocomplete(ix, "tes", limit=3), - [b"test2", b"test", b"test3"], - ) - self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test2"]) - self.assertListEqual(index.autocomplete(ix, "tes", limit=0), []) - - def test_archive_serial_number_ranging(self) -> None: - """ - GIVEN: - - Document with an archive serial number above schema allowed size - WHEN: - - Document is provided to the index - THEN: - - Error is logged - - Document ASN is reset to 0 for the index - """ - doc1 = Document.objects.create( - title="doc1", - checksum="A", - content="test test2 test3", - # yes, this is allowed, unless full_clean is run - # DRF does call the validators, this test won't - archive_serial_number=Document.ARCHIVE_SERIAL_NUMBER_MAX + 1, - ) - with self.assertLogs("paperless.index", level="ERROR") as cm: - with mock.patch( - "documents.index.AsyncWriter.update_document", - ) as mocked_update_doc: - index.add_or_update_document(doc1) - - mocked_update_doc.assert_called_once() - _, kwargs = mocked_update_doc.call_args - - self.assertEqual(kwargs["asn"], 0) - - error_str = cm.output[0] - expected_str = "ERROR:paperless.index:Not indexing Archive Serial Number 4294967296 of document 1" - self.assertIn(expected_str, error_str) - - def test_archive_serial_number_is_none(self) -> None: - """ - GIVEN: - - Document with no archive serial number - WHEN: - - Document is provided to the index - THEN: - - ASN isn't touched - """ - doc1 = Document.objects.create( - title="doc1", - checksum="A", - content="test test2 test3", - ) - with mock.patch( - "documents.index.AsyncWriter.update_document", - ) as mocked_update_doc: - 
index.add_or_update_document(doc1) - - mocked_update_doc.assert_called_once() - _, kwargs = mocked_update_doc.call_args - - self.assertIsNone(kwargs["asn"]) - - @override_settings(TIME_ZONE="Pacific/Auckland") - def test_added_today_respects_local_timezone_boundary(self) -> None: - tz = get_current_timezone() - fixed_now = datetime(2025, 7, 20, 15, 0, 0, tzinfo=tz) - - # Fake a time near the local boundary (1 AM NZT = 13:00 UTC on previous UTC day) - local_dt = datetime(2025, 7, 20, 1, 0, 0).replace(tzinfo=tz) - utc_dt = local_dt.astimezone(timezone.utc) - - doc = Document.objects.create( - title="Time zone", - content="Testing added:today", - checksum="edgecase123", - added=utc_dt, - ) - - with index.open_index_writer() as writer: - index.update_document(writer, doc) - - superuser = User.objects.create_superuser(username="testuser") - self.client.force_login(superuser) - - with mock.patch("documents.index.now", return_value=fixed_now): - response = self.client.get("/api/documents/?query=added:today") - results = response.json()["results"] - self.assertEqual(len(results), 1) - self.assertEqual(results[0]["id"], doc.id) - - response = self.client.get("/api/documents/?query=added:yesterday") - results = response.json()["results"] - self.assertEqual(len(results), 0) - - -@override_settings(TIME_ZONE="UTC") -class TestRewriteNaturalDateKeywords(SimpleTestCase): - """ - Unit tests for rewrite_natural_date_keywords function. 
- """ - - def _rewrite_with_now(self, query: str, now_dt: datetime) -> str: - with mock.patch("documents.index.now", return_value=now_dt): - return index.rewrite_natural_date_keywords(query) - - def _assert_rewrite_contains( - self, - query: str, - now_dt: datetime, - *expected_fragments: str, - ) -> str: - result = self._rewrite_with_now(query, now_dt) - for fragment in expected_fragments: - self.assertIn(fragment, result) - return result - - def test_range_keywords(self) -> None: - """ - Test various different range keywords - """ - cases = [ - ( - "added:today", - datetime(2025, 7, 20, 15, 30, 45, tzinfo=timezone.utc), - ("added:[20250720", "TO 20250720"), - ), - ( - "added:yesterday", - datetime(2025, 7, 20, 15, 30, 45, tzinfo=timezone.utc), - ("added:[20250719", "TO 20250719"), - ), - ( - "added:this month", - datetime(2025, 7, 15, 12, 0, 0, tzinfo=timezone.utc), - ("added:[20250701", "TO 20250731"), - ), - ( - "added:previous month", - datetime(2025, 7, 15, 12, 0, 0, tzinfo=timezone.utc), - ("added:[20250601", "TO 20250630"), - ), - ( - "added:this year", - datetime(2025, 7, 15, 12, 0, 0, tzinfo=timezone.utc), - ("added:[20250101", "TO 20251231"), - ), - ( - "added:previous year", - datetime(2025, 7, 15, 12, 0, 0, tzinfo=timezone.utc), - ("added:[20240101", "TO 20241231"), - ), - # Previous quarter from July 15, 2025 is April-June. - ( - "added:previous quarter", - datetime(2025, 7, 15, 12, 0, 0, tzinfo=timezone.utc), - ("added:[20250401", "TO 20250630"), - ), - # July 20, 2025 is a Sunday (weekday 6) so previous week is July 7-13. 
- ( - "added:previous week", - datetime(2025, 7, 20, 12, 0, 0, tzinfo=timezone.utc), - ("added:[20250707", "TO 20250713"), - ), - ] - - for query, now_dt, fragments in cases: - with self.subTest(query=query): - self._assert_rewrite_contains(query, now_dt, *fragments) - - def test_additional_fields(self) -> None: - fixed_now = datetime(2025, 7, 20, 15, 30, 45, tzinfo=timezone.utc) - # created - self._assert_rewrite_contains("created:today", fixed_now, "created:[20250720") - # modified - self._assert_rewrite_contains("modified:today", fixed_now, "modified:[20250720") - - def test_basic_syntax_variants(self) -> None: - """ - Test that quoting, casing, and multi-clause queries are parsed. - """ - fixed_now = datetime(2025, 7, 20, 15, 30, 45, tzinfo=timezone.utc) - - # quoted keywords - result1 = self._rewrite_with_now('added:"today"', fixed_now) - result2 = self._rewrite_with_now("added:'today'", fixed_now) - self.assertIn("added:[20250720", result1) - self.assertIn("added:[20250720", result2) - - # case insensitivity - for query in ("added:TODAY", "added:Today", "added:ToDaY"): - with self.subTest(case_variant=query): - self._assert_rewrite_contains(query, fixed_now, "added:[20250720") - - # multiple clauses - result = self._rewrite_with_now("added:today created:yesterday", fixed_now) - self.assertIn("added:[20250720", result) - self.assertIn("created:[20250719", result) - - def test_no_match(self) -> None: - """ - Test that queries without keywords are unchanged. - """ - query = "title:test content:example" - result = index.rewrite_natural_date_keywords(query) - self.assertEqual(query, result) - - @override_settings(TIME_ZONE="Pacific/Auckland") - def test_timezone_awareness(self) -> None: - """ - Test timezone conversion. 
- """ - # July 20, 2025 1:00 AM NZST = July 19, 2025 13:00 UTC - fixed_now = datetime(2025, 7, 20, 1, 0, 0, tzinfo=get_current_timezone()) - result = self._rewrite_with_now("added:today", fixed_now) - # Should convert to UTC properly - self.assertIn("added:[20250719", result) - - -class TestIndexResilience(DirectoriesMixin, SimpleTestCase): - def _assert_recreate_called(self, mock_create_in) -> None: - mock_create_in.assert_called_once() - path_arg, schema_arg = mock_create_in.call_args.args - self.assertEqual(path_arg, settings.INDEX_DIR) - self.assertEqual(schema_arg.__class__.__name__, "Schema") - - def test_transient_missing_segment_does_not_force_recreate(self) -> None: - """ - GIVEN: - - Index directory exists - WHEN: - - open_index is called - - Opening the index raises FileNotFoundError once due to a - transient missing segment - THEN: - - Index is opened successfully on retry - - Index is not recreated - """ - file_marker = settings.INDEX_DIR / "file_marker.txt" - file_marker.write_text("keep") - expected_index = object() - - with ( - mock.patch("documents.index.exists_in", return_value=True), - mock.patch( - "documents.index.open_dir", - side_effect=[FileNotFoundError("missing"), expected_index], - ) as mock_open_dir, - mock.patch( - "documents.index.create_in", - ) as mock_create_in, - mock.patch( - "documents.index.rmtree", - ) as mock_rmtree, - ): - ix = index.open_index() - - self.assertIs(ix, expected_index) - self.assertGreaterEqual(mock_open_dir.call_count, 2) - mock_rmtree.assert_not_called() - mock_create_in.assert_not_called() - self.assertEqual(file_marker.read_text(), "keep") - - def test_transient_errors_exhaust_retries_and_recreate(self) -> None: - """ - GIVEN: - - Index directory exists - WHEN: - - open_index is called - - Opening the index raises FileNotFoundError multiple times due to - transient missing segments - THEN: - - Index is recreated after retries are exhausted - """ - recreated_index = object() - - with ( - 
self.assertLogs("paperless.index", level="ERROR") as cm, - mock.patch("documents.index.exists_in", return_value=True), - mock.patch( - "documents.index.open_dir", - side_effect=FileNotFoundError("missing"), - ) as mock_open_dir, - mock.patch("documents.index.rmtree") as mock_rmtree, - mock.patch( - "documents.index.create_in", - return_value=recreated_index, - ) as mock_create_in, - ): - ix = index.open_index() - - self.assertIs(ix, recreated_index) - self.assertEqual(mock_open_dir.call_count, 4) - mock_rmtree.assert_called_once_with(settings.INDEX_DIR) - self._assert_recreate_called(mock_create_in) - self.assertIn( - "Error while opening the index after retries, recreating.", - cm.output[0], - ) - - def test_non_transient_error_recreates_index(self) -> None: - """ - GIVEN: - - Index directory exists - WHEN: - - open_index is called - - Opening the index raises a "non-transient" error - THEN: - - Index is recreated - """ - recreated_index = object() - - with ( - self.assertLogs("paperless.index", level="ERROR") as cm, - mock.patch("documents.index.exists_in", return_value=True), - mock.patch( - "documents.index.open_dir", - side_effect=RuntimeError("boom"), - ), - mock.patch("documents.index.rmtree") as mock_rmtree, - mock.patch( - "documents.index.create_in", - return_value=recreated_index, - ) as mock_create_in, - ): - ix = index.open_index() - - self.assertIs(ix, recreated_index) - mock_rmtree.assert_called_once_with(settings.INDEX_DIR) - self._assert_recreate_called(mock_create_in) - self.assertIn( - "Error while opening the index, recreating.", - cm.output[0], - ) diff --git a/src/documents/tests/test_management.py b/src/documents/tests/test_management.py index 7719d21dd..6ea4431fd 100644 --- a/src/documents/tests/test_management.py +++ b/src/documents/tests/test_management.py @@ -103,16 +103,75 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase): @pytest.mark.management -class TestMakeIndex(TestCase): - 
@mock.patch("documents.management.commands.document_index.index_reindex") - def test_reindex(self, m) -> None: +@pytest.mark.django_db +class TestMakeIndex: + def test_reindex(self, mocker: MockerFixture) -> None: + """Reindex command must call the backend rebuild method to recreate the index.""" + mock_get_backend = mocker.patch( + "documents.management.commands.document_index.get_backend", + ) call_command("document_index", "reindex", skip_checks=True) - m.assert_called_once() + mock_get_backend.return_value.rebuild.assert_called_once() - @mock.patch("documents.management.commands.document_index.index_optimize") - def test_optimize(self, m) -> None: + def test_optimize(self) -> None: + """Optimize command must execute without error (Tantivy handles optimization automatically).""" call_command("document_index", "optimize", skip_checks=True) - m.assert_called_once() + + def test_reindex_recreate_wipes_index(self, mocker: MockerFixture) -> None: + """Reindex with --recreate must wipe the index before rebuilding.""" + mock_wipe = mocker.patch( + "documents.management.commands.document_index.wipe_index", + ) + mock_get_backend = mocker.patch( + "documents.management.commands.document_index.get_backend", + ) + call_command("document_index", "reindex", recreate=True, skip_checks=True) + mock_wipe.assert_called_once() + mock_get_backend.return_value.rebuild.assert_called_once() + + def test_reindex_without_recreate_does_not_wipe_index( + self, + mocker: MockerFixture, + ) -> None: + """Reindex without --recreate must not wipe the index.""" + mock_wipe = mocker.patch( + "documents.management.commands.document_index.wipe_index", + ) + mocker.patch( + "documents.management.commands.document_index.get_backend", + ) + call_command("document_index", "reindex", skip_checks=True) + mock_wipe.assert_not_called() + + def test_reindex_if_needed_skips_when_up_to_date( + self, + mocker: MockerFixture, + ) -> None: + """Conditional reindex must skip rebuild when schema version and 
language match.""" + mocker.patch( + "documents.management.commands.document_index.needs_rebuild", + return_value=False, + ) + mock_get_backend = mocker.patch( + "documents.management.commands.document_index.get_backend", + ) + call_command("document_index", "reindex", if_needed=True, skip_checks=True) + mock_get_backend.return_value.rebuild.assert_not_called() + + def test_reindex_if_needed_runs_when_rebuild_needed( + self, + mocker: MockerFixture, + ) -> None: + """Conditional reindex must proceed with rebuild when schema version or language changed.""" + mocker.patch( + "documents.management.commands.document_index.needs_rebuild", + return_value=True, + ) + mock_get_backend = mocker.patch( + "documents.management.commands.document_index.get_backend", + ) + call_command("document_index", "reindex", if_needed=True, skip_checks=True) + mock_get_backend.return_value.rebuild.assert_called_once() @pytest.mark.management diff --git a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py index e038bf786..e13d3827a 100644 --- a/src/documents/tests/test_matchables.py +++ b/src/documents/tests/test_matchables.py @@ -452,7 +452,10 @@ class TestDocumentConsumptionFinishedSignal(TestCase): """ def setUp(self) -> None: + from documents.search import reset_backend + TestCase.setUp(self) + reset_backend() User.objects.create_user(username="test_consumer", password="12345") self.doc_contains = Document.objects.create( content="I contain the keyword.", @@ -464,6 +467,9 @@ class TestDocumentConsumptionFinishedSignal(TestCase): override_settings(INDEX_DIR=self.index_dir).enable() def tearDown(self) -> None: + from documents.search import reset_backend + + reset_backend() shutil.rmtree(self.index_dir, ignore_errors=True) def test_tag_applied_any(self) -> None: diff --git a/src/documents/tests/test_tag_hierarchy.py b/src/documents/tests/test_tag_hierarchy.py index 12e5475f3..57aa27e3a 100644 --- a/src/documents/tests/test_tag_hierarchy.py +++ 
b/src/documents/tests/test_tag_hierarchy.py @@ -11,10 +11,12 @@ from documents.models import WorkflowAction from documents.models import WorkflowTrigger from documents.serialisers import TagSerializer from documents.signals.handlers import run_workflows +from documents.tests.utils import DirectoriesMixin -class TestTagHierarchy(APITestCase): +class TestTagHierarchy(DirectoriesMixin, APITestCase): def setUp(self) -> None: + super().setUp() self.user = User.objects.create_superuser(username="admin") self.client.force_authenticate(user=self.user) diff --git a/src/documents/tests/test_task_signals.py b/src/documents/tests/test_task_signals.py index 4f17a8fd2..3dcbbeaff 100644 --- a/src/documents/tests/test_task_signals.py +++ b/src/documents/tests/test_task_signals.py @@ -2,6 +2,7 @@ import uuid from unittest import mock import celery +from django.contrib.auth import get_user_model from django.test import TestCase from documents.data_models import ConsumableDocument @@ -20,6 +21,11 @@ from documents.tests.utils import DirectoriesMixin @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) class TestTaskSignalHandler(DirectoriesMixin, TestCase): + @classmethod + def setUpTestData(cls) -> None: + super().setUpTestData() + cls.user = get_user_model().objects.create_user(username="testuser") + def util_call_before_task_publish_handler( self, headers_to_use, @@ -57,7 +63,7 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase): ), DocumentMetadataOverrides( title="Hello world", - owner_id=1, + owner_id=self.user.id, ), ), # kwargs @@ -75,7 +81,7 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase): self.assertEqual(headers["id"], task.task_id) self.assertEqual("hello-999.pdf", task.task_file_name) self.assertEqual(PaperlessTask.TaskName.CONSUME_FILE, task.task_name) - self.assertEqual(1, task.owner_id) + self.assertEqual(self.user.id, task.owner_id) self.assertEqual(celery.states.PENDING, task.status) def test_task_prerun_handler(self) -> None: @@ 
-208,10 +214,12 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase): mime_type="application/pdf", ) - with mock.patch("documents.index.add_or_update_document") as add: + with mock.patch("documents.search.get_backend") as mock_get_backend: + mock_backend = mock.MagicMock() + mock_get_backend.return_value = mock_backend add_to_index(sender=None, document=root) - add.assert_called_once_with(root) + mock_backend.add_or_update.assert_called_once_with(root, effective_content="") def test_add_to_index_reindexes_root_for_version_documents(self) -> None: root = Document.objects.create( @@ -226,13 +234,17 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase): root_document=root, ) - with mock.patch("documents.index.add_or_update_document") as add: + with mock.patch("documents.search.get_backend") as mock_get_backend: + mock_backend = mock.MagicMock() + mock_get_backend.return_value = mock_backend add_to_index(sender=None, document=version) - self.assertEqual(add.call_count, 2) - self.assertEqual(add.call_args_list[0].args[0].id, version.id) - self.assertEqual(add.call_args_list[1].args[0].id, root.id) + self.assertEqual(mock_backend.add_or_update.call_count, 1) self.assertEqual( - add.call_args_list[1].kwargs, + mock_backend.add_or_update.call_args_list[0].args[0].id, + version.id, + ) + self.assertEqual( + mock_backend.add_or_update.call_args_list[0].kwargs, {"effective_content": version.content}, ) diff --git a/src/documents/tests/test_tasks.py b/src/documents/tests/test_tasks.py index 37f1e6fed..9fb9ddbc6 100644 --- a/src/documents/tests/test_tasks.py +++ b/src/documents/tests/test_tasks.py @@ -23,29 +23,10 @@ from documents.tests.utils import DirectoriesMixin from documents.tests.utils import FileSystemAssertsMixin -class TestIndexReindex(DirectoriesMixin, TestCase): - def test_index_reindex(self) -> None: - Document.objects.create( - title="test", - content="my document", - checksum="wow", - added=timezone.now(), - created=timezone.now(), - 
modified=timezone.now(), - ) - - tasks.index_reindex() - +@pytest.mark.django_db +class TestIndexOptimize: def test_index_optimize(self) -> None: - Document.objects.create( - title="test", - content="my document", - checksum="wow", - added=timezone.now(), - created=timezone.now(), - modified=timezone.now(), - ) - + """Index optimization task must execute without error (Tantivy handles optimization automatically).""" tasks.index_optimize() diff --git a/src/documents/tests/test_workflows.py b/src/documents/tests/test_workflows.py index 58d989882..0fd893a5b 100644 --- a/src/documents/tests/test_workflows.py +++ b/src/documents/tests/test_workflows.py @@ -4802,6 +4802,7 @@ class TestWebhookSecurity: @pytest.mark.django_db +@pytest.mark.usefixtures("_search_index") class TestDateWorkflowLocalization( SampleDirMixin, ): diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py index 346d895aa..cc4190974 100644 --- a/src/documents/tests/utils.py +++ b/src/documents/tests/utils.py @@ -157,11 +157,17 @@ class DirectoriesMixin: """ def setUp(self) -> None: + from documents.search import reset_backend + + reset_backend() self.dirs = setup_directories() super().setUp() def tearDown(self) -> None: + from documents.search import reset_backend + super().tearDown() + reset_backend() remove_dirs(self.dirs) diff --git a/src/documents/utils.py b/src/documents/utils.py index 975185a5f..2ed6758dd 100644 --- a/src/documents/utils.py +++ b/src/documents/utils.py @@ -1,14 +1,27 @@ import hashlib import logging import shutil +from collections.abc import Callable +from collections.abc import Iterable from os import utime from pathlib import Path from subprocess import CompletedProcess from subprocess import run +from typing import TypeVar from django.conf import settings from PIL import Image +_T = TypeVar("_T") + +# A function that wraps an iterable — typically used to inject a progress bar. 
+IterWrapper = Callable[[Iterable[_T]], Iterable[_T]] + + +def identity(iterable: Iterable[_T]) -> Iterable[_T]: + """Return the iterable unchanged; the no-op default for IterWrapper.""" + return iterable + def _coerce_to_path( source: Path | str, diff --git a/src/documents/views.py b/src/documents/views.py index 244e81161..024e846a0 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -100,7 +100,6 @@ from rest_framework.viewsets import ReadOnlyModelViewSet from rest_framework.viewsets import ViewSet from documents import bulk_edit -from documents import index from documents.bulk_download import ArchiveOnlyStrategy from documents.bulk_download import OriginalAndArchiveStrategy from documents.bulk_download import OriginalsOnlyStrategy @@ -1029,9 +1028,9 @@ class DocumentViewSet( response_data["content"] = content_doc.content response = Response(response_data) - from documents import index + from documents.search import get_backend - index.add_or_update_document(refreshed_doc) + get_backend().add_or_update(refreshed_doc) document_updated.send( sender=self.__class__, @@ -1060,9 +1059,9 @@ class DocumentViewSet( return Response({"results": serializer.data, "selection_data": selection_data}) def destroy(self, request, *args, **kwargs): - from documents import index + from documents.search import get_backend - index.remove_document_from_index(self.get_object()) + get_backend().remove(self.get_object().pk) try: return super().destroy(request, *args, **kwargs) except Exception as e: @@ -1469,9 +1468,9 @@ class DocumentViewSet( doc.modified = timezone.now() doc.save() - from documents import index + from documents.search import get_backend - index.add_or_update_document(doc) + get_backend().add_or_update(doc) notes = serializer.to_representation(doc).get("notes") @@ -1506,9 +1505,9 @@ class DocumentViewSet( doc.modified = timezone.now() doc.save() - from documents import index + from documents.search import get_backend - index.add_or_update_document(doc) + 
get_backend().add_or_update(doc) notes = serializer.to_representation(doc).get("notes") @@ -1820,12 +1819,13 @@ class DocumentViewSet( "Cannot delete the root/original version. Delete the document instead.", ) - from documents import index + from documents.search import get_backend - index.remove_document_from_index(version_doc) + _backend = get_backend() + _backend.remove(version_doc.pk) version_doc_id = version_doc.id version_doc.delete() - index.add_or_update_document(root_doc) + _backend.add_or_update(root_doc) if settings.AUDIT_LOG_ENABLED: actor = ( request.user if request.user and request.user.is_authenticated else None @@ -2025,10 +2025,6 @@ class ChatStreamingView(GenericAPIView): ), ) class UnifiedSearchViewSet(DocumentViewSet): - def __init__(self, *args: Any, **kwargs: Any) -> None: - super().__init__(*args, **kwargs) - self.searcher = None - def get_serializer_class(self): if self._is_search_request(): return SearchResultSerializer @@ -2041,17 +2037,34 @@ class UnifiedSearchViewSet(DocumentViewSet): or "more_like_id" in self.request.query_params ) - def filter_queryset(self, queryset): - filtered_queryset = super().filter_queryset(queryset) + def list(self, request, *args, **kwargs): + if not self._is_search_request(): + return super().list(request) - if self._is_search_request(): - if "query" in self.request.query_params: - from documents import index + from documents.search import TantivyRelevanceList + from documents.search import get_backend - query_class = index.DelayedFullTextQuery - elif "more_like_id" in self.request.query_params: + try: + backend = get_backend() + # ORM-filtered queryset: permissions + field filters + ordering (DRF backends applied) + filtered_qs = self.filter_queryset(self.get_queryset()) + + user = None if request.user.is_superuser else request.user + + if "query" in request.query_params: + query_str = request.query_params["query"] + results = backend.search( + query_str, + user=user, + page=1, + page_size=10000, + 
sort_field=None, + sort_reverse=False, + ) + else: + # more_like_id — validate permission on the seed document first try: - more_like_doc_id = int(self.request.query_params["more_like_id"]) + more_like_doc_id = int(request.query_params["more_like_id"]) more_like_doc = Document.objects.select_related("owner").get( pk=more_like_doc_id, ) @@ -2059,76 +2072,71 @@ class UnifiedSearchViewSet(DocumentViewSet): raise PermissionDenied(_("Invalid more_like_id")) if not has_perms_owner_aware( - self.request.user, + request.user, "view_document", more_like_doc, ): raise PermissionDenied(_("Insufficient permissions.")) - from documents import index - - query_class = index.DelayedMoreLikeThisQuery - else: - raise ValueError - - return query_class( - self.searcher, - self.request.query_params, - self.paginator.get_page_size(self.request), - filter_queryset=filtered_queryset, - ) - else: - return filtered_queryset - - def list(self, request, *args, **kwargs): - if self._is_search_request(): - from documents import index - - try: - with index.open_index_searcher() as s: - self.searcher = s - queryset = self.filter_queryset(self.get_queryset()) - page = self.paginate_queryset(queryset) - - serializer = self.get_serializer(page, many=True) - response = self.get_paginated_response(serializer.data) - - response.data["corrected_query"] = ( - queryset.suggested_correction - if hasattr(queryset, "suggested_correction") - else None - ) - - if get_boolean( - str( - request.query_params.get( - "include_selection_data", - "false", - ), - ), - ): - result_ids = queryset.get_result_ids() - response.data["selection_data"] = ( - self._get_selection_data_for_queryset( - Document.objects.filter(pk__in=result_ids), - ) - ) - - return response - except NotFound: - raise - except PermissionDenied as e: - invalid_more_like_id_message = _("Invalid more_like_id") - if str(e.detail) == str(invalid_more_like_id_message): - return HttpResponseForbidden(invalid_more_like_id_message) - return 
HttpResponseForbidden(_("Insufficient permissions.")) - except Exception as e: - logger.warning(f"An error occurred listing search results: {e!s}") - return HttpResponseBadRequest( - "Error listing search results, check logs for more detail.", + results = backend.more_like_this( + more_like_doc_id, + user=user, + page=1, + page_size=10000, ) - else: - return super().list(request) + + hits_by_id = {h["id"]: h for h in results.hits} + + # Determine sort order: no ordering param -> Tantivy relevance; otherwise -> ORM order + ordering_param = request.query_params.get("ordering", "").lstrip("-") + if not ordering_param: + # Preserve Tantivy relevance order; intersect with ORM-visible IDs + orm_ids = set(filtered_qs.values_list("pk", flat=True)) + ordered_hits = [h for h in results.hits if h["id"] in orm_ids] + else: + # Use ORM ordering (already applied by DocumentsOrderingFilter) + hit_ids = set(hits_by_id.keys()) + orm_ordered_ids = filtered_qs.filter(id__in=hit_ids).values_list( + "pk", + flat=True, + ) + ordered_hits = [ + hits_by_id[pk] for pk in orm_ordered_ids if pk in hits_by_id + ] + + rl = TantivyRelevanceList(ordered_hits) + page = self.paginate_queryset(rl) + + if page is not None: + serializer = self.get_serializer(page, many=True) + response = self.get_paginated_response(serializer.data) + response.data["corrected_query"] = None + if get_boolean( + str(request.query_params.get("include_selection_data", "false")), + ): + all_ids = [h["id"] for h in ordered_hits] + response.data["selection_data"] = ( + self._get_selection_data_for_queryset( + filtered_qs.filter(pk__in=all_ids), + ) + ) + return response + + serializer = self.get_serializer(ordered_hits, many=True) + return Response(serializer.data) + + except NotFound: + raise + except PermissionDenied as e: + invalid_more_like_id_message = _("Invalid more_like_id") + if str(e.detail) == str(invalid_more_like_id_message): + return HttpResponseForbidden(invalid_more_like_id_message) + return 
HttpResponseForbidden(_("Insufficient permissions.")) + except Exception as e: + logger.warning(f"An error occurred listing search results: {e!s}") + return HttpResponseBadRequest( + "Error listing search results, check logs for more detail.", + ) @action(detail=False, methods=["GET"], name="Get Next ASN") def next_asn(self, request, *args, **kwargs): @@ -2946,18 +2954,9 @@ class SearchAutoCompleteView(GenericAPIView): else: limit = 10 - from documents import index + from documents.search import get_backend - ix = index.open_index() - - return Response( - index.autocomplete( - ix, - term, - limit, - user, - ), - ) + return Response(get_backend().autocomplete(term, limit, user)) @extend_schema_view( @@ -3023,20 +3022,21 @@ class GlobalSearchView(PassUserMixin): # First search by title docs = all_docs.filter(title__icontains=query) if not db_only and len(docs) < OBJECT_LIMIT: - # If we don't have enough results, search by content - from documents import index + # If we don't have enough results, search by content. + # Over-fetch from Tantivy (no permission filter) and rely on + # the ORM all_docs queryset for authoritative permission gating. 
+ from documents.search import get_backend - with index.open_index_searcher() as s: - fts_query = index.DelayedFullTextQuery( - s, - request.query_params, - OBJECT_LIMIT, - filter_queryset=all_docs, - ) - results = fts_query[0:1] - docs = docs | Document.objects.filter( - id__in=[r["id"] for r in results], - ) + fts_results = get_backend().search( + query, + user=None, + page=1, + page_size=1000, + sort_field=None, + sort_reverse=False, + ) + fts_ids = {h["id"] for h in fts_results.hits} + docs = docs | all_docs.filter(id__in=fts_ids) docs = docs[:OBJECT_LIMIT] saved_views = ( get_objects_for_user_owner_aware( @@ -4279,10 +4279,16 @@ class SystemStatusView(PassUserMixin): index_error = None try: - ix = index.open_index() + from documents.search import get_backend + + get_backend() # triggers open/rebuild; raises on error index_status = "OK" - index_last_modified = make_aware( - datetime.fromtimestamp(ix.last_modified()), + # Use the most-recently modified file in the index directory as a proxy + # for last index write time (Tantivy has no single last_modified() call). 
+ index_dir = settings.INDEX_DIR + mtimes = [p.stat().st_mtime for p in index_dir.iterdir() if p.is_file()] + index_last_modified = ( + make_aware(datetime.fromtimestamp(max(mtimes))) if mtimes else None ) except Exception as e: index_status = "ERROR" diff --git a/src/paperless/settings/__init__.py b/src/paperless/settings/__init__.py index 1c33db7c6..3522b3187 100644 --- a/src/paperless/settings/__init__.py +++ b/src/paperless/settings/__init__.py @@ -21,6 +21,7 @@ from paperless.settings.custom import parse_hosting_settings from paperless.settings.custom import parse_ignore_dates from paperless.settings.custom import parse_redis_url from paperless.settings.parsers import get_bool_from_env +from paperless.settings.parsers import get_choice_from_env from paperless.settings.parsers import get_float_from_env from paperless.settings.parsers import get_int_from_env from paperless.settings.parsers import get_list_from_env @@ -85,6 +86,11 @@ EMPTY_TRASH_DIR = ( # threads. MEDIA_LOCK = MEDIA_ROOT / "media.lock" INDEX_DIR = DATA_DIR / "index" + +ADVANCED_FUZZY_SEARCH_THRESHOLD: float | None = get_float_from_env( + "PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD", +) + MODEL_FILE = get_path_from_env( "PAPERLESS_MODEL_FILE", DATA_DIR / "classification_model.pickle", @@ -1033,10 +1039,55 @@ def _get_nltk_language_setting(ocr_lang: str) -> str | None: return iso_code_to_nltk.get(ocr_lang) +def _get_search_language_setting(ocr_lang: str) -> str | None: + """ + Determine the Tantivy stemmer language. + + If PAPERLESS_SEARCH_LANGUAGE is explicitly set, it is validated against + the languages supported by Tantivy's built-in stemmer and returned as-is. + Otherwise the primary Tesseract language code from PAPERLESS_OCR_LANGUAGE + is mapped to the corresponding ISO 639-1 code understood by Tantivy. + Returns None when unset and the OCR language has no Tantivy stemmer. 
+ """ + explicit = os.environ.get("PAPERLESS_SEARCH_LANGUAGE") + if explicit is not None: + # Lazy import avoids any app-loading order concerns; _tokenizer has no + # Django dependencies so this is safe. + from documents.search._tokenizer import SUPPORTED_LANGUAGES + + return get_choice_from_env("PAPERLESS_SEARCH_LANGUAGE", SUPPORTED_LANGUAGES) + + # Infer from the primary Tesseract language code (ISO 639-2/T → ISO 639-1) + primary = ocr_lang.split("+", maxsplit=1)[0].lower() + _ocr_to_search: dict[str, str] = { + "ara": "ar", + "dan": "da", + "nld": "nl", + "eng": "en", + "fin": "fi", + "fra": "fr", + "deu": "de", + "ell": "el", + "hun": "hu", + "ita": "it", + "nor": "no", + "por": "pt", + "ron": "ro", + "rus": "ru", + "spa": "es", + "swe": "sv", + "tam": "ta", + "tur": "tr", + } + return _ocr_to_search.get(primary) + + NLTK_ENABLED: Final[bool] = get_bool_from_env("PAPERLESS_ENABLE_NLTK", "yes") NLTK_LANGUAGE: str | None = _get_nltk_language_setting(OCR_LANGUAGE) +SEARCH_LANGUAGE: str | None = _get_search_language_setting(OCR_LANGUAGE) + ############################################################################### # Email Preprocessors # ############################################################################### diff --git a/src/paperless/settings/parsers.py b/src/paperless/settings/parsers.py index 09e474bd5..163633d84 100644 --- a/src/paperless/settings/parsers.py +++ b/src/paperless/settings/parsers.py @@ -260,7 +260,7 @@ def get_list_from_env( def get_choice_from_env( env_key: str, - choices: set[str], + choices: set[str] | frozenset[str], default: str | None = None, ) -> str: """ diff --git a/src/paperless/tests/parsers/test_tesseract_custom_settings.py b/src/paperless/tests/parsers/test_tesseract_custom_settings.py index 60d1486f4..9f3afacb6 100644 --- a/src/paperless/tests/parsers/test_tesseract_custom_settings.py +++ b/src/paperless/tests/parsers/test_tesseract_custom_settings.py @@ -14,6 +14,11 @@ from paperless.parsers.tesseract import 
RasterisedDocumentParser class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase): + @classmethod + def setUpTestData(cls) -> None: + super().setUpTestData() + ApplicationConfiguration.objects.get_or_create() + @staticmethod def get_params(): """ diff --git a/src/paperless/tests/settings/test_settings.py b/src/paperless/tests/settings/test_settings.py index b0ae3c0c5..0694d9360 100644 --- a/src/paperless/tests/settings/test_settings.py +++ b/src/paperless/tests/settings/test_settings.py @@ -2,6 +2,9 @@ import os from unittest import TestCase from unittest import mock +import pytest + +from paperless.settings import _get_search_language_setting from paperless.settings import _parse_paperless_url from paperless.settings import default_threads_per_worker @@ -32,6 +35,48 @@ class TestThreadCalculation(TestCase): self.assertLessEqual(default_workers * default_threads, i) +@pytest.mark.parametrize( + ("env_value", "expected"), + [ + ("en", "en"), + ("de", "de"), + ("fr", "fr"), + ("swedish", "swedish"), + ], +) +def test_get_search_language_setting_explicit_valid( + monkeypatch: pytest.MonkeyPatch, + env_value: str, + expected: str, +) -> None: + """ + GIVEN: + - PAPERLESS_SEARCH_LANGUAGE is set to a valid Tantivy stemmer language + WHEN: + - _get_search_language_setting is called + THEN: + - The explicit value is returned regardless of the OCR language + """ + monkeypatch.setenv("PAPERLESS_SEARCH_LANGUAGE", env_value) + assert _get_search_language_setting("deu") == expected + + +def test_get_search_language_setting_explicit_invalid( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """ + GIVEN: + - PAPERLESS_SEARCH_LANGUAGE is set to an unsupported language code + WHEN: + - _get_search_language_setting is called + THEN: + - ValueError is raised + """ + monkeypatch.setenv("PAPERLESS_SEARCH_LANGUAGE", "klingon") + with pytest.raises(ValueError, match="klingon"): + _get_search_language_setting("eng") + + class TestPaperlessURLSettings(TestCase): 
def test_paperless_url(self) -> None: """ diff --git a/src/paperless/views.py b/src/paperless/views.py index 404bc4339..a3b965f3f 100644 --- a/src/paperless/views.py +++ b/src/paperless/views.py @@ -36,7 +36,6 @@ from rest_framework.permissions import IsAuthenticated from rest_framework.response import Response from rest_framework.viewsets import ModelViewSet -from documents.index import DelayedQuery from documents.permissions import PaperlessObjectPermissions from documents.tasks import llmindex_index from paperless.filters import GroupFilterSet @@ -83,20 +82,12 @@ class StandardPagination(PageNumberPagination): ) def get_all_result_ids(self): + from documents.search import TantivyRelevanceList + query = self.page.paginator.object_list - if isinstance(query, DelayedQuery): - try: - ids = [ - query.searcher.ixreader.stored_fields( - doc_num, - )["id"] - for doc_num in query.saved_results.get(0).results.docs() - ] - except Exception: - pass - else: - ids = self.page.paginator.object_list.values_list("pk", flat=True) - return ids + if isinstance(query, TantivyRelevanceList): + return [h["id"] for h in query._hits] + return self.page.paginator.object_list.values_list("pk", flat=True) def get_paginated_response_schema(self, schema): response_schema = super().get_paginated_response_schema(schema) diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py index bee8f0dd9..a54492f1f 100644 --- a/src/paperless_ai/indexing.py +++ b/src/paperless_ai/indexing.py @@ -1,11 +1,8 @@ import logging import shutil -from collections.abc import Callable -from collections.abc import Iterable from datetime import timedelta from pathlib import Path from typing import TYPE_CHECKING -from typing import TypeVar from celery import states from django.conf import settings @@ -13,22 +10,17 @@ from django.utils import timezone from documents.models import Document from documents.models import PaperlessTask +from documents.utils import IterWrapper +from documents.utils import 
identity from paperless_ai.embedding import build_llm_index_text from paperless_ai.embedding import get_embedding_dim from paperless_ai.embedding import get_embedding_model -_T = TypeVar("_T") -IterWrapper = Callable[[Iterable[_T]], Iterable[_T]] - if TYPE_CHECKING: from llama_index.core import VectorStoreIndex from llama_index.core.schema import BaseNode -def _identity(iterable: Iterable[_T]) -> Iterable[_T]: - return iterable - - logger = logging.getLogger("paperless_ai.indexing") @@ -176,7 +168,7 @@ def vector_store_file_exists(): def update_llm_index( *, - iter_wrapper: IterWrapper[Document] = _identity, + iter_wrapper: IterWrapper[Document] = identity, rebuild=False, ) -> str: """ diff --git a/uv.lock b/uv.lock index 6bbfdc53b..feffefce5 100644 --- a/uv.lock +++ b/uv.lock @@ -350,15 +350,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/73/3183c9e41ca755713bdf2cc1d0810df742c09484e2e1ddd693bee53877c1/brotli-1.2.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d2d085ded05278d1c7f65560aae97b3160aeb2ea2c0b3e26204856beccb60888", size = 1488164, upload-time = "2025-11-05T18:38:53.079Z" }, ] -[[package]] -name = "cached-property" -version = "2.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/76/4b/3d870836119dbe9a5e3c9a61af8cc1a8b69d75aea564572e385882d5aefb/cached_property-2.0.1.tar.gz", hash = "sha256:484d617105e3ee0e4f1f58725e72a8ef9e93deee462222dbd51cd91230897641", size = 10574, upload-time = "2024-10-25T15:43:55.667Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/11/0e/7d8225aab3bc1a0f5811f8e1b557aa034ac04bdf641925b30d3caf586b28/cached_property-2.0.1-py3-none-any.whl", hash = "sha256:f617d70ab1100b7bcf6e42228f9ddcb78c676ffa167278d9f730d1c2fba69ccb", size = 7428, upload-time = "2024-10-25T15:43:54.711Z" }, -] - [[package]] name = "cbor2" version = "5.9.0" @@ -2910,12 +2901,12 @@ dependencies = [ { name = "scikit-learn", marker = "sys_platform == 'darwin' or 
sys_platform == 'linux'" }, { name = "sentence-transformers", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "setproctitle", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "tantivy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "tika-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "torch", version = "2.10.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'linux'" }, { name = "watchfiles", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "whitenoise", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, - { name = "whoosh-reloaded", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "zxing-cpp", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, ] @@ -2951,6 +2942,7 @@ dev = [ { name = "pytest-sugar", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "pytest-xdist", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "ruff", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "time-machine", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "zensical", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, ] docs = [ @@ -2974,6 +2966,7 @@ testing = [ { name = "pytest-rerunfailures", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "pytest-sugar", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "pytest-xdist", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "time-machine", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, ] typing = [ { name = "celery-types", marker = 
"sys_platform == 'darwin' or sys_platform == 'linux'" }, @@ -3064,11 +3057,11 @@ requires-dist = [ { name = "scikit-learn", specifier = "~=1.8.0" }, { name = "sentence-transformers", specifier = ">=4.1" }, { name = "setproctitle", specifier = "~=1.3.4" }, + { name = "tantivy", specifier = ">=0.25.1" }, { name = "tika-client", specifier = "~=0.10.0" }, { name = "torch", specifier = "~=2.10.0", index = "https://download.pytorch.org/whl/cpu" }, { name = "watchfiles", specifier = ">=1.1.1" }, { name = "whitenoise", specifier = "~=6.11" }, - { name = "whoosh-reloaded", specifier = ">=2.7.5" }, { name = "zxing-cpp", specifier = "~=3.0.0" }, ] provides-extras = ["mariadb", "postgres", "webserver"] @@ -3090,6 +3083,7 @@ dev = [ { name = "pytest-sugar" }, { name = "pytest-xdist", specifier = "~=3.8.0" }, { name = "ruff", specifier = "~=0.15.0" }, + { name = "time-machine", specifier = ">=2.13" }, { name = "zensical", specifier = ">=0.0.21" }, ] docs = [{ name = "zensical", specifier = ">=0.0.21" }] @@ -3111,6 +3105,7 @@ testing = [ { name = "pytest-rerunfailures", specifier = "~=16.1" }, { name = "pytest-sugar" }, { name = "pytest-xdist", specifier = "~=3.8.0" }, + { name = "time-machine", specifier = ">=2.13" }, ] typing = [ { name = "celery-types" }, @@ -4664,6 +4659,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, ] +[[package]] +name = "tantivy" +version = "0.25.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1b/f9/0cd3955d155d3e3ef74b864769514dd191e5dacba9f0beb7af2d914942ce/tantivy-0.25.1.tar.gz", hash = "sha256:68a3314699a7d18fcf338b52bae8ce46a97dde1128a3e47e33fa4db7f71f265e", size = 75120, upload-time = "2025-12-02T11:57:12.997Z" } +wheels = [ + { url 
= "https://files.pythonhosted.org/packages/4e/7a/8a277f377e8a151fc0e71d4ffc1114aefb6e5e1c7dd609fed0955cf34ed8/tantivy-0.25.1-cp311-cp311-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:d363d7b4207d3a5aa7f0d212420df35bed18bdb6bae26a2a8bd57428388b7c29", size = 7637033, upload-time = "2025-12-02T11:56:18.104Z" }, + { url = "https://files.pythonhosted.org/packages/71/31/8b4acdedfc9f9a2d04b1340d07eef5213d6f151d1e18da0cb423e5f090d2/tantivy-0.25.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:8f4389cf1d889a1df7c5a3195806b4b56c37cee10d8a26faaa0dea35a867b5ff", size = 3932180, upload-time = "2025-12-02T11:56:19.833Z" }, + { url = "https://files.pythonhosted.org/packages/2f/dc/3e8499c21b4b9795e8f2fc54c68ce5b92905aaeadadaa56ecfa9180b11b1/tantivy-0.25.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99864c09fc54652c3c2486cdf13f86cdc8200f4b481569cb291e095ca5d496e5", size = 4197620, upload-time = "2025-12-02T11:56:21.496Z" }, + { url = "https://files.pythonhosted.org/packages/f8/8e/f2ce62fffc811eb62bead92c7b23c2e218f817cbd54c4f3b802e03ba1438/tantivy-0.25.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05abf37ddbc5063c575548be0d62931629c086bff7a5a1b67cf5a8f5ebf4cd8c", size = 4183794, upload-time = "2025-12-02T11:56:23.215Z" }, + { url = "https://files.pythonhosted.org/packages/41/e7/6849c713ed0996c7628324c60512c4882006f0a62145e56c624a93407f90/tantivy-0.25.1-cp312-cp312-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:90fd919e5f611809f746560ecf36eb9be824dec62e21ae17a27243759edb9aa1", size = 7621494, upload-time = "2025-12-02T11:56:27.069Z" }, + { url = "https://files.pythonhosted.org/packages/c5/22/c3d8294600dc6e7fa350daef9ff337d3c06e132b81df727de9f7a50c692a/tantivy-0.25.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:4613c7cf6c23f3a97989819690a0f956d799354957de7a204abcc60083cebe02", size = 3925219, upload-time = "2025-12-02T11:56:29.403Z" 
}, + { url = "https://files.pythonhosted.org/packages/41/fc/cbb1df71dd44c9110eff4eaaeda9d44f2d06182fe0452193be20ddfba93f/tantivy-0.25.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c477bd20b4df804d57dfc5033431bef27cde605695ae141b03abbf6ebc069129", size = 4198699, upload-time = "2025-12-02T11:56:31.359Z" }, + { url = "https://files.pythonhosted.org/packages/47/4d/71abb78b774073c3ce12a4faa4351a9d910a71ffa3659526affba163873d/tantivy-0.25.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9b1a1ba1113c523c7ff7b10f282d6c4074006f7ef8d71e1d973d51bf7291ddb", size = 4183585, upload-time = "2025-12-02T11:56:33.317Z" }, + { url = "https://files.pythonhosted.org/packages/3d/25/73cfbcf1a8ea49be6c42817431cac46b70a119fe64da903fcc2d92b5b511/tantivy-0.25.1-cp313-cp313-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:f51ff7196c6f31719202080ed8372d5e3d51e92c749c032fb8234f012e99744c", size = 7622530, upload-time = "2025-12-02T11:56:36.839Z" }, + { url = "https://files.pythonhosted.org/packages/12/c8/c0d7591cdf4f7e7a9fc4da786d1ca8cd1aacffaa2be16ea6d401a8e4a566/tantivy-0.25.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:550e63321bfcacc003859f2fa29c1e8e56450807b3c9a501c1add27cfb9236d9", size = 3925637, upload-time = "2025-12-02T11:56:38.425Z" }, + { url = "https://files.pythonhosted.org/packages/3a/09/bedfc223bffec7641b417dd7ab071134b2ef8f8550e9b1fb6014657ef52e/tantivy-0.25.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fde31cc8d6e122faf7902aeea32bc008a429a6e8904e34d3468126a3ec01b016", size = 4197322, upload-time = "2025-12-02T11:56:40.411Z" }, + { url = "https://files.pythonhosted.org/packages/f5/f1/1fa5183500c8042200c9f2b840d34f5bbcfb434a1ee750e7132262d2a5c9/tantivy-0.25.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b11bd5a518b0be645320b47af8493f6a40c4f3234313e37adcf4534a564d27dd", size = 4183143, upload-time = 
"2025-12-02T11:56:42.048Z" }, + { url = "https://files.pythonhosted.org/packages/8b/2f/581519492226f97d23bd0adc95dad991ebeaa73ea6abc8bff389a3096d9a/tantivy-0.25.1-cp313-cp313t-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:dae99e75b7eaa9bf5bd16ab106b416370f08c135aed0e117d62a3201cd1ffe36", size = 7610316, upload-time = "2025-12-02T11:56:45.927Z" }, + { url = "https://files.pythonhosted.org/packages/91/40/5d7bc315ab9e6a22c5572656e8ada1c836cfa96dccf533377504fbc3c9d9/tantivy-0.25.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:506e9533c5ef4d3df43bad64ffecc0aa97c76e361ea610815dc3a20a9d6b30b3", size = 3919882, upload-time = "2025-12-02T11:56:48.469Z" }, + { url = "https://files.pythonhosted.org/packages/02/b9/e0ef2f57a6a72444cb66c2ffbc310ab33ffaace275f1c4b0319d84ea3f18/tantivy-0.25.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dbd4f8f264dacbcc9dee542832da2173fd53deaaea03f082d95214f8b5ed6bc", size = 4196031, upload-time = "2025-12-02T11:56:50.151Z" }, + { url = "https://files.pythonhosted.org/packages/1e/02/bf3f8cacfd08642e14a73f7956a3fb95d58119132c98c121b9065a1f8615/tantivy-0.25.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:824c643ccb640dd9e35e00c5d5054ddf3323f56fe4219d57d428a9eeea13d22c", size = 4183437, upload-time = "2025-12-02T11:56:51.818Z" }, + { url = "https://files.pythonhosted.org/packages/ff/44/9f1d67aa5030f7eebc966c863d1316a510a971dd8bb45651df4acdfae9ed/tantivy-0.25.1-cp314-cp314-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:7f5d29ae85dd0f23df8d15b3e7b341d4f9eb5a446bbb9640df48ac1f6d9e0c6c", size = 7623723, upload-time = "2025-12-02T11:56:55.066Z" }, + { url = "https://files.pythonhosted.org/packages/db/30/6e085bd3ed9d12da3c91c185854abd70f9dfd35fb36a75ea98428d42c30b/tantivy-0.25.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:f2d2938fb69a74fc1bb36edfaf7f0d1596fa1264db0f377bda2195c58bcb6245", size = 3926243, 
upload-time = "2025-12-02T11:56:57.058Z" }, + { url = "https://files.pythonhosted.org/packages/32/f5/a00d65433430f51718e5cc6938df571765d7c4e03aedec5aef4ab567aa9b/tantivy-0.25.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f5ff124c4802558e627091e780b362ca944169736caba5a372eef39a79d0ae0", size = 4207186, upload-time = "2025-12-02T11:56:58.803Z" }, + { url = "https://files.pythonhosted.org/packages/19/63/61bdb12fc95f2a7f77bd419a5149bfa9f28caa76cb569bf2b6b06e1d033e/tantivy-0.25.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43b80ef62a340416139c93d19264e5f808da48e04f9305f1092b8ed22be0a5be", size = 4187312, upload-time = "2025-12-02T11:57:00.595Z" }, +] + [[package]] name = "tenacity" version = "9.1.2" @@ -4752,6 +4775,62 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706, upload-time = "2025-10-06T20:22:33.385Z" }, ] +[[package]] +name = "time-machine" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/02/fc/37b02f6094dbb1f851145330460532176ed2f1dc70511a35828166c41e52/time_machine-3.2.0.tar.gz", hash = "sha256:a4ddd1cea17b8950e462d1805a42b20c81eb9aafc8f66b392dd5ce997e037d79", size = 14804, upload-time = "2025-12-17T23:33:02.599Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f5/e1/03aae5fbaa53859f665094af696338fc7cae733d926a024af69982712350/time_machine-3.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c188a9dda9fcf975022f1b325b466651b96a4dfc223c523ed7ed8d979f9bf3e8", size = 19143, upload-time = "2025-12-17T23:31:44.258Z" }, + { url = 
"https://files.pythonhosted.org/packages/75/8f/98cb17bebb52b22ff4ec26984dd44280f9c71353c3bae0640a470e6683e5/time_machine-3.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:17245f1cc2dd13f9d63a174be59bb2684a9e5e0a112ab707e37be92068cd655f", size = 15273, upload-time = "2025-12-17T23:31:45.246Z" }, + { url = "https://files.pythonhosted.org/packages/dd/2f/ca11e4a7897234bb9331fcc5f4ed4714481ba4012370cc79a0ae8c42ea0a/time_machine-3.2.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d9bd1de1996e76efd36ae15970206c5089fb3728356794455bd5cd8d392b5537", size = 31049, upload-time = "2025-12-17T23:31:46.613Z" }, + { url = "https://files.pythonhosted.org/packages/cf/ad/d17d83a59943094e6b6c6a3743caaf6811b12203c3e07a30cc7bcc2ab7ee/time_machine-3.2.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:98493cd50e8b7f941eab69b9e18e697ad69db1a0ec1959f78f3d7b0387107e5c", size = 32632, upload-time = "2025-12-17T23:31:47.72Z" }, + { url = "https://files.pythonhosted.org/packages/71/50/d60576d047a0dfb5638cdfb335e9c3deb6e8528544fa0b3966a8480f72b7/time_machine-3.2.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:31f2a33d595d9f91eb9bc7f157f0dc5721f5789f4c4a9e8b852cdedb2a7d9b16", size = 34289, upload-time = "2025-12-17T23:31:48.913Z" }, + { url = "https://files.pythonhosted.org/packages/fa/fe/4afa602dbdebddde6d0ea4a7fe849e49b9bb85dc3fb415725a87ccb4b471/time_machine-3.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9f78ac4213c10fbc44283edd1a29cfb7d3382484f4361783ddc057292aaa1889", size = 33175, upload-time = "2025-12-17T23:31:50.611Z" }, + { url = "https://files.pythonhosted.org/packages/0d/87/c152e23977c1d7d7c94eb3ed3ea45cc55971796205125c6fdff40db2c60f/time_machine-3.2.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:c1326b09e947b360926d529a96d1d9e126ce120359b63b506ecdc6ee20755c23", size = 31170, upload-time = "2025-12-17T23:31:51.645Z" 
}, + { url = "https://files.pythonhosted.org/packages/80/af/54acf51d0f3ade3b51eab73df6192937c9a938753ef5456dff65eb8630be/time_machine-3.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9f2949f03d15264cc15c38918a2cda8966001f0f4ebe190cbfd9c56d91aed8ac", size = 32292, upload-time = "2025-12-17T23:31:52.803Z" }, + { url = "https://files.pythonhosted.org/packages/71/8b/080c8eedcd67921a52ba5bd0e075362062509ab63c86fc1a0442fad241a6/time_machine-3.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:cc4bee5b0214d7dc4ebc91f4a4c600f1a598e9b5606ac751f42cb6f6740b1dbb", size = 19255, upload-time = "2025-12-17T23:31:58.057Z" }, + { url = "https://files.pythonhosted.org/packages/66/17/0e5291e9eb705bf8a5a1305f826e979af307bbeb79def4ddbf4b3f9a81e0/time_machine-3.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3ca036304b4460ae2fdc1b52dd8b1fa7cf1464daa427fc49567413c09aa839c1", size = 15360, upload-time = "2025-12-17T23:31:59.048Z" }, + { url = "https://files.pythonhosted.org/packages/8b/e8/9ab87b71d2e2b62463b9b058b7ae7ac09fb57f8fcd88729dec169d304340/time_machine-3.2.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5442735b41d7a2abc2f04579b4ca6047ed4698a8338a4fec92c7c9423e7938cb", size = 33029, upload-time = "2025-12-17T23:32:00.413Z" }, + { url = "https://files.pythonhosted.org/packages/4b/26/b5ca19da6f25ea905b3e10a0ea95d697c1aeba0404803a43c68f1af253e6/time_machine-3.2.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:97da3e971e505cb637079fb07ab0bcd36e33279f8ecac888ff131f45ef1e4d8d", size = 34579, upload-time = "2025-12-17T23:32:01.431Z" }, + { url = "https://files.pythonhosted.org/packages/79/ca/6ac7ad5f10ea18cc1d9de49716ba38c32132c7b64532430d92ef240c116b/time_machine-3.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3cdda6dee4966e38aeb487309bb414c6cb23a81fc500291c77a8fcd3098832e7", size = 35961, upload-time = 
"2025-12-17T23:32:02.521Z" }, + { url = "https://files.pythonhosted.org/packages/33/67/390dd958bed395ab32d79a9fe61fe111825c0dd4ded54dbba7e867f171e6/time_machine-3.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:33d9efd302a6998bcc8baa4d84f259f8a4081105bd3d7f7af7f1d0abd3b1c8aa", size = 34668, upload-time = "2025-12-17T23:32:03.585Z" }, + { url = "https://files.pythonhosted.org/packages/da/57/c88fff034a4e9538b3ae7c68c9cfb283670b14d17522c5a8bc17d29f9a4b/time_machine-3.2.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3a0b0a33971f14145853c9bd95a6ab0353cf7e0019fa2a7aa1ae9fddfe8eab50", size = 32891, upload-time = "2025-12-17T23:32:04.656Z" }, + { url = "https://files.pythonhosted.org/packages/2d/70/ebbb76022dba0fec8f9156540fc647e4beae1680c787c01b1b6200e56d70/time_machine-3.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2d0be9e5f22c38082d247a2cdcd8a936504e9db60b7b3606855fb39f299e9548", size = 34080, upload-time = "2025-12-17T23:32:06.146Z" }, + { url = "https://files.pythonhosted.org/packages/ee/cd/43ad5efc88298af3c59b66769cea7f055567a85071579ed40536188530c1/time_machine-3.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c421a8eb85a4418a7675a41bf8660224318c46cc62e4751c8f1ceca752059090", size = 19318, upload-time = "2025-12-17T23:32:10.518Z" }, + { url = "https://files.pythonhosted.org/packages/b0/f6/084010ef7f4a3f38b5a4900923d7c85b29e797655c4f6ee4ce54d903cca8/time_machine-3.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8f4e758f7727d0058c4950c66b58200c187072122d6f7a98b610530a4233ea7b", size = 15390, upload-time = "2025-12-17T23:32:11.625Z" }, + { url = "https://files.pythonhosted.org/packages/25/aa/1cabb74134f492270dc6860cb7865859bf40ecf828be65972827646e91ad/time_machine-3.2.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:154bd3f75c81f70218b2585cc12b60762fb2665c507eec5ec5037d8756d9b4e0", size = 33115, upload-time = "2025-12-17T23:32:13.219Z" }, + { url = 
"https://files.pythonhosted.org/packages/5e/03/78c5d7dfa366924eb4dbfcc3fc917c39a4280ca234b12819cc1f16c03d88/time_machine-3.2.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d50cfe5ebea422c896ad8d278af9648412b7533b8ea6adeeee698a3fd9b1d3b7", size = 34705, upload-time = "2025-12-17T23:32:14.29Z" }, + { url = "https://files.pythonhosted.org/packages/86/93/d5e877c24541f674c6869ff6e9c56833369796010190252e92c9d7ae5f0f/time_machine-3.2.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:636576501724bd6a9124e69d86e5aef263479e89ef739c5db361469f0463a0a1", size = 36104, upload-time = "2025-12-17T23:32:15.354Z" }, + { url = "https://files.pythonhosted.org/packages/22/1c/d4bae72f388f67efc9609f89b012e434bb19d9549c7a7b47d6c7d9e5c55d/time_machine-3.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:40e6f40c57197fcf7ec32d2c563f4df0a82c42cdcc3cab27f688e98f6060df10", size = 34765, upload-time = "2025-12-17T23:32:16.434Z" }, + { url = "https://files.pythonhosted.org/packages/1d/c3/ac378cf301d527d8dfad2f0db6bad0dfb1ab73212eaa56d6b96ee5d9d20b/time_machine-3.2.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:a1bcf0b846bbfc19a79bc19e3fa04d8c7b1e8101c1b70340ffdb689cd801ea53", size = 33010, upload-time = "2025-12-17T23:32:17.532Z" }, + { url = "https://files.pythonhosted.org/packages/06/35/7ce897319accda7a6970b288a9a8c52d25227342a7508505a2b3d235b649/time_machine-3.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ae55a56c179f4fe7a62575ad5148b6ed82f6c7e5cf2f9a9ec65f2f5b067db5f5", size = 34185, upload-time = "2025-12-17T23:32:18.566Z" }, + { url = "https://files.pythonhosted.org/packages/67/e7/487f0ba5fe6c58186a5e1af2a118dfa2c160fedb37ef53a7e972d410408e/time_machine-3.2.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:59d71545e62525a4b85b6de9ab5c02ee3c61110fd7f636139914a2335dcbfc9c", size = 20000, upload-time = "2025-12-17T23:32:23.058Z" }, + { url = 
"https://files.pythonhosted.org/packages/e1/17/eb2c0054c8d44dd42df84ccd434539249a9c7d0b8eb53f799be2102500ab/time_machine-3.2.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:999672c621c35362bc28e03ca0c7df21500195540773c25993421fd8d6cc5003", size = 15657, upload-time = "2025-12-17T23:32:24.125Z" }, + { url = "https://files.pythonhosted.org/packages/43/21/93443b5d1dd850f8bb9442e90d817a9033dcce6bfbdd3aabbb9786251c80/time_machine-3.2.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5faf7397f0580c7b9d67288522c8d7863e85f0cffadc0f1fccdb2c3dfce5783e", size = 39216, upload-time = "2025-12-17T23:32:25.542Z" }, + { url = "https://files.pythonhosted.org/packages/9f/9e/18544cf8acc72bb1dc03762231c82ecc259733f4bb6770a7bbe5cd138603/time_machine-3.2.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d3dd886ec49f1fa5a00e844f5947e5c0f98ce574750c24b7424c6f77fc1c3e87", size = 40764, upload-time = "2025-12-17T23:32:26.643Z" }, + { url = "https://files.pythonhosted.org/packages/27/f7/9fe9ce2795636a3a7467307af6bdf38bb613ddb701a8a5cd50ec713beb5e/time_machine-3.2.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da0ecd96bc7bbe450acaaabe569d84e81688f1be8ad58d1470e42371d145fb53", size = 43526, upload-time = "2025-12-17T23:32:27.693Z" }, + { url = "https://files.pythonhosted.org/packages/03/c1/a93e975ba9dec22e87ec92d18c28e67d36bd536f9119ffa439b2892b0c9c/time_machine-3.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:158220e946c1c4fb8265773a0282c88c35a7e3bb5d78e3561214e3b3231166f3", size = 41727, upload-time = "2025-12-17T23:32:28.985Z" }, + { url = "https://files.pythonhosted.org/packages/5f/fb/e3633e5a6bbed1c76bb2e9810dabc2f8467532ffcd29b9aed404b473061a/time_machine-3.2.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8c1aee29bc54356f248d5d7dfdd131e12ca825e850a08c0ebdb022266d073013", size = 38952, upload-time = 
"2025-12-17T23:32:30.031Z" }, + { url = "https://files.pythonhosted.org/packages/82/3d/02e9fb2526b3d6b1b45bc8e4d912d95d1cd699d1a3f6df985817d37a0600/time_machine-3.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c8ed2224f09d25b1c2fc98683613aca12f90f682a427eabb68fc824d27014e4a", size = 39829, upload-time = "2025-12-17T23:32:31.075Z" }, + { url = "https://files.pythonhosted.org/packages/61/70/b4b980d126ed155c78d1879c50d60c8dcbd47bd11cb14ee7be50e0dfc07f/time_machine-3.2.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:1398980c017fe5744d66f419e0115ee48a53b00b146d738e1416c225eb610b82", size = 19303, upload-time = "2025-12-17T23:32:35.796Z" }, + { url = "https://files.pythonhosted.org/packages/73/73/eaa33603c69a68fe2b6f54f9dd75481693d62f1d29676531002be06e2d1c/time_machine-3.2.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:4f8f4e35f4191ef70c2ab8ff490761ee9051b891afce2bf86dde3918eb7b537b", size = 15431, upload-time = "2025-12-17T23:32:37.244Z" }, + { url = "https://files.pythonhosted.org/packages/76/10/b81e138e86cc7bab40cdb59d294b341e172201f4a6c84bb0ec080407977a/time_machine-3.2.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:6db498686ecf6163c5aa8cf0bcd57bbe0f4081184f247edf3ee49a2612b584f9", size = 33206, upload-time = "2025-12-17T23:32:38.713Z" }, + { url = "https://files.pythonhosted.org/packages/d3/72/4deab446b579e8bd5dca91de98595c5d6bd6a17ce162abf5c5f2ce40d3d8/time_machine-3.2.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:027c1807efb74d0cd58ad16524dec94212fbe900115d70b0123399883657ac0f", size = 34792, upload-time = "2025-12-17T23:32:40.223Z" }, + { url = "https://files.pythonhosted.org/packages/2c/39/439c6b587ddee76d533fe972289d0646e0a5520e14dc83d0a30aeb5565f7/time_machine-3.2.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92432610c05676edd5e6946a073c6f0c926923123ce7caee1018dc10782c713d", size = 
36187, upload-time = "2025-12-17T23:32:41.705Z" }, + { url = "https://files.pythonhosted.org/packages/4b/db/2da4368db15180989bab83746a857bde05ad16e78f326801c142bb747a06/time_machine-3.2.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c25586b62480eb77ef3d953fba273209478e1ef49654592cd6a52a68dfe56a67", size = 34855, upload-time = "2025-12-17T23:32:42.817Z" }, + { url = "https://files.pythonhosted.org/packages/88/84/120a431fee50bc4c241425bee4d3a4910df4923b7ab5f7dff1bf0c772f08/time_machine-3.2.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:6bf3a2fa738d15e0b95d14469a0b8ea42635467408d8b490e263d5d45c9a177f", size = 33222, upload-time = "2025-12-17T23:32:43.94Z" }, + { url = "https://files.pythonhosted.org/packages/f9/ea/89cfda82bb8c57ff91bb9a26751aa234d6d90e9b4d5ab0ad9dce0f9f0329/time_machine-3.2.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ce76b82276d7ad2a66cdc85dad4df19d1422b69183170a34e8fbc4c3f35502f7", size = 34270, upload-time = "2025-12-17T23:32:45.037Z" }, + { url = "https://files.pythonhosted.org/packages/86/a1/142de946dc4393f910bf4564b5c3ba819906e1f49b06c9cb557519c849e4/time_machine-3.2.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:4e374779021446fc2b5c29d80457ec9a3b1a5df043dc2aae07d7c1415d52323c", size = 19991, upload-time = "2025-12-17T23:32:49.933Z" }, + { url = "https://files.pythonhosted.org/packages/ee/62/7f17def6289901f94726921811a16b9adce46e666362c75d45730c60274f/time_machine-3.2.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:122310a6af9c36e9a636da32830e591e7923e8a07bdd0a43276c3a36c6821c90", size = 15707, upload-time = "2025-12-17T23:32:50.969Z" }, + { url = "https://files.pythonhosted.org/packages/5d/d3/3502fb9bd3acb159c18844b26c43220201a0d4a622c0c853785d07699a92/time_machine-3.2.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ba3eeb0f018cc362dd8128befa3426696a2e16dd223c3fb695fde184892d4d8c", size = 39207, upload-time = "2025-12-17T23:32:52.033Z" }, + { url = 
"https://files.pythonhosted.org/packages/5a/be/8b27f4aa296fda14a5a2ad7f588ddd450603c33415ab3f8e85b2f1a44678/time_machine-3.2.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:77d38ba664b381a7793f8786efc13b5004f0d5f672dae814430445b8202a67a6", size = 40764, upload-time = "2025-12-17T23:32:53.167Z" }, + { url = "https://files.pythonhosted.org/packages/42/cd/fe4c4e5c8ab6d48fab3624c32be9116fb120173a35fe67e482e5cf68b3d2/time_machine-3.2.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f09abeb8f03f044d72712207e0489a62098ad3ad16dac38927fcf80baca4d6a7", size = 43508, upload-time = "2025-12-17T23:32:54.597Z" }, + { url = "https://files.pythonhosted.org/packages/b4/28/5a3ba2fce85b97655a425d6bb20a441550acd2b304c96b2c19d3839f721a/time_machine-3.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6b28367ce4f73987a55e230e1d30a57a3af85da8eb1a140074eb6e8c7e6ef19f", size = 41712, upload-time = "2025-12-17T23:32:55.781Z" }, + { url = "https://files.pythonhosted.org/packages/81/58/e38084be7fdabb4835db68a3a47e58c34182d79fc35df1ecbe0db2c5359f/time_machine-3.2.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:903c7751c904581da9f7861c3015bed7cdc40047321291d3694a3cdc783bbca3", size = 38939, upload-time = "2025-12-17T23:32:56.867Z" }, + { url = "https://files.pythonhosted.org/packages/40/d0/ad3feb0a392ef4e0c08bc32024950373ddc0669002cbdcbb9f3bf0c2d114/time_machine-3.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:528217cad85ede5f85c8bc78b0341868d3c3cfefc6ecb5b622e1cacb6c73247b", size = 39837, upload-time = "2025-12-17T23:32:58.283Z" }, +] + [[package]] name = "tinytag" version = "2.2.1" @@ -5474,18 +5553,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/db/eb/d5583a11486211f3ebd4b385545ae787f32363d453c19fffd81106c9c138/whitenoise-6.12.0-py3-none-any.whl", hash = "sha256:fc5e8c572e33ebf24795b47b6a7da8da3c00cff2349f5b04c02f28d0cc5a3cc2", size = 20302, 
upload-time = "2026-02-27T00:05:40.086Z" }, ] -[[package]] -name = "whoosh-reloaded" -version = "2.7.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cached-property", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/17/51/3fb4b9fdeaaf96512514ccf2871186333ce41a0de2ea48236a4056a5f6af/Whoosh-Reloaded-2.7.5.tar.gz", hash = "sha256:39ed7dfbd1fec97af33933107bdf78110728375ed0f2abb25dec6dbfdcb279d8", size = 1061606, upload-time = "2024-02-02T20:06:42.285Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/69/90/866dfe421f188217ecd7339585e961034a7f4fdc96b62cec3b40a50dbdef/Whoosh_Reloaded-2.7.5-py2.py3-none-any.whl", hash = "sha256:2ab6aeeafb359fbff4beb3c704b960fd88240354f3363f1c5bdb5c2325cae80e", size = 551793, upload-time = "2024-02-02T20:06:39.868Z" }, -] - [[package]] name = "wrapt" version = "2.0.1"