mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-30 21:02:45 +00:00
Compare commits
20 Commits
ci-sa
...
feature-ta
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ae494d4b6a | ||
|
|
fdf08bdc43 | ||
|
|
b10f3de2eb | ||
|
|
b626f5602c | ||
|
|
7f63259f41 | ||
|
|
a213c2cc9b | ||
|
|
34d2897ab1 | ||
|
|
50f6b2d4c3 | ||
|
|
9df2a603b7 | ||
|
|
fcd4d28f37 | ||
|
|
0fb57205db | ||
|
|
0078ef9cd5 | ||
|
|
957049c512 | ||
|
|
33da63c229 | ||
|
|
cbeb7469a1 | ||
|
|
2cf85d9b58 | ||
|
|
494d17e7ac | ||
|
|
e8fe3a6a62 | ||
|
|
884edd6eea | ||
|
|
d00fb4f345 |
@@ -3,9 +3,14 @@
|
||||
|
||||
declare -r log_prefix="[init-index]"
|
||||
|
||||
declare -r index_version=9
|
||||
# Version 1: Tantivy backend (replaces Whoosh; resets versioning from scratch)
|
||||
declare -r index_version=1
|
||||
declare -r data_dir="${PAPERLESS_DATA_DIR:-/usr/src/paperless/data}"
|
||||
declare -r index_version_file="${data_dir}/.index_version"
|
||||
declare -r index_language_file="${data_dir}/.index_language"
|
||||
# Track the raw env var (not the resolved language) so inference changes
|
||||
# don't cause spurious reindexes — only explicit setting changes trigger one.
|
||||
declare -r search_language="${PAPERLESS_SEARCH_LANGUAGE:-}"
|
||||
|
||||
# Rebuild the search index, then record the current index version and the
# configured search language in marker files so later boots can detect
# whether another reindex is needed.
# When not running as root, drop privileges to the `paperless` user via
# s6-setuidgid before touching the index or the marker files.
update_index () {
	echo "${log_prefix} Search index out of date. Updating..."
	if [[ -n "${USER_IS_NON_ROOT}" ]]; then
		python3 manage.py document_index reindex --no-progress-bar
		# Quote expansions (SC2086) so an unexpected value can't word-split.
		echo "${index_version}" | tee "${index_version_file}" > /dev/null
		echo "${search_language}" | tee "${index_language_file}" > /dev/null
	else
		s6-setuidgid paperless python3 manage.py document_index reindex --no-progress-bar
		echo "${index_version}" | s6-setuidgid paperless tee "${index_version_file}" > /dev/null
		echo "${search_language}" | s6-setuidgid paperless tee "${index_language_file}" > /dev/null
	fi
}
|
||||
|
||||
if [[ (! -f "${index_version_file}") ]]; then
|
||||
if [[ ! -f "${index_version_file}" ]]; then
|
||||
echo "${log_prefix} No index version file found"
|
||||
update_index
|
||||
elif [[ $(<"${index_version_file}") != "$index_version" ]]; then
|
||||
echo "${log_prefix} index version updated"
|
||||
elif [[ $(<"${index_version_file}") != "${index_version}" ]]; then
|
||||
echo "${log_prefix} Index version updated"
|
||||
update_index
|
||||
elif [[ ! -f "${index_language_file}" ]]; then
|
||||
echo "${log_prefix} No language file found"
|
||||
update_index
|
||||
elif [[ $(<"${index_language_file}") != "${search_language}" ]]; then
|
||||
echo "${log_prefix} Search language changed"
|
||||
update_index
|
||||
fi
|
||||
|
||||
@@ -180,6 +180,17 @@ following:
|
||||
This might not actually do anything. Not every new paperless version
|
||||
comes with new database migrations.
|
||||
|
||||
4. Rebuild the search index.
|
||||
|
||||
```shell-session
|
||||
cd src
|
||||
python3 manage.py document_index reindex
|
||||
```
|
||||
|
||||
This is required when the search backend has changed (e.g. the upgrade
|
||||
to Tantivy). It is safe to run on every upgrade — if the index is already
|
||||
current it completes quickly.
|
||||
|
||||
### Database Upgrades
|
||||
|
||||
Paperless-ngx is compatible with Django-supported versions of PostgreSQL and MariaDB and it is generally
|
||||
@@ -459,11 +470,31 @@ document_index {reindex,optimize}
|
||||
Specify `reindex` to have the index created from scratch. This may take
|
||||
some time.
|
||||
|
||||
Specify `optimize` to optimize the index. This updates certain aspects
|
||||
of the index and usually makes queries faster and also ensures that the
|
||||
autocompletion works properly. This command is regularly invoked by the
|
||||
Specify `optimize` to optimize the index. This command is regularly invoked by the
|
||||
task scheduler.
|
||||
|
||||
!!! note
|
||||
|
||||
The `optimize` subcommand is deprecated and is now a no-op. Tantivy manages
|
||||
segment merging automatically; no manual optimization step is needed.
|
||||
|
||||
!!! note
|
||||
|
||||
**Docker users:** On first startup after upgrading, the container automatically
|
||||
detects the index format change and runs a full reindex before starting the
|
||||
webserver. No manual step is required.
|
||||
|
||||
**Bare metal users:** After upgrading, run the following command once to rebuild
|
||||
the search index in the new format:
|
||||
|
||||
```shell-session
|
||||
cd src
|
||||
python3 manage.py document_index reindex
|
||||
```
|
||||
|
||||
Changing `PAPERLESS_SEARCH_LANGUAGE` also requires a manual reindex on bare
|
||||
metal (Docker handles this automatically).
|
||||
|
||||
### Clearing the database read cache
|
||||
|
||||
If the database read cache is enabled, **you must run this command** after making any changes to the database outside the application context.
|
||||
|
||||
@@ -167,9 +167,8 @@ Query parameters:
|
||||
- `term`: The incomplete term.
|
||||
- `limit`: Amount of results. Defaults to 10.
|
||||
|
||||
Results returned by the endpoint are ordered by importance of the term
|
||||
in the document index. The first result is the term that has the highest
|
||||
[Tf/Idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) score in the index.
|
||||
Results are ordered by how many of the user's visible documents contain
|
||||
each matching word. The first result is the word that appears in the most documents.
|
||||
|
||||
```json
|
||||
["term1", "term3", "term6", "term4"]
|
||||
|
||||
@@ -1103,6 +1103,32 @@ should be a valid crontab(5) expression describing when to run.
|
||||
|
||||
Defaults to `0 0 * * *` or daily at midnight.
|
||||
|
||||
#### [`PAPERLESS_SEARCH_LANGUAGE=<language>`](#PAPERLESS_SEARCH_LANGUAGE) {#PAPERLESS_SEARCH_LANGUAGE}
|
||||
|
||||
: Sets the stemmer language for the full-text search index.
|
||||
Stemming improves recall by matching word variants (e.g. "running" matches "run").
|
||||
Changing this setting causes the index to be rebuilt automatically on next startup.
|
||||
An invalid value raises an error at startup.
|
||||
|
||||
: Use the ISO 639-1 two-letter code (e.g. `en`, `de`, `fr`). Lowercase full names
|
||||
(e.g. `english`, `german`, `french`) are also accepted. The capitalized names shown
|
||||
in the [Tantivy Language enum](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html)
|
||||
documentation are **not** valid — use the lowercase equivalent.
|
||||
|
||||
: If not set, paperless infers the language from
|
||||
[`PAPERLESS_OCR_LANGUAGE`](#PAPERLESS_OCR_LANGUAGE). If the OCR language has no
|
||||
Tantivy stemmer equivalent, stemming is disabled.
|
||||
|
||||
Defaults to unset (inferred from `PAPERLESS_OCR_LANGUAGE`).
|
||||
|
||||
#### [`PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD=<float>`](#PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD) {#PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD}
|
||||
|
||||
: When set to a float value, approximate/fuzzy matching is applied alongside exact
|
||||
matching. Fuzzy results rank below exact matches. A value of `0.5` is a reasonable
|
||||
starting point. Leave unset to disable fuzzy matching entirely.
|
||||
|
||||
Defaults to unset (disabled).
|
||||
|
||||
#### [`PAPERLESS_SANITY_TASK_CRON=<cron expression>`](#PAPERLESS_SANITY_TASK_CRON) {#PAPERLESS_SANITY_TASK_CRON}
|
||||
|
||||
: Configures the scheduled sanity checker frequency. The value should be a
|
||||
|
||||
@@ -839,18 +839,28 @@ Matching inexact words:
|
||||
produ*name
|
||||
```
|
||||
|
||||
!!! note
|
||||
Matching natural date keywords:
|
||||
|
||||
Inexact terms are hard for search indexes. These queries might take a
|
||||
while to execute. That's why paperless offers auto complete and query
|
||||
correction.
|
||||
```
|
||||
added:today
|
||||
modified:yesterday
|
||||
created:this_week
|
||||
added:last_month
|
||||
modified:this_year
|
||||
```
|
||||
|
||||
Supported date keywords: `today`, `yesterday`, `this_week`, `last_week`,
|
||||
`this_month`, `last_month`, `this_year`, `last_year`.
|
||||
|
||||
All of these constructs can be combined as you see fit. If you want to
|
||||
learn more about the query language used by paperless, paperless uses
|
||||
Whoosh's default query language. Head over to [Whoosh query
|
||||
language](https://whoosh.readthedocs.io/en/latest/querylang.html). For
|
||||
details on what date parsing utilities are available, see [Date
|
||||
parsing](https://whoosh.readthedocs.io/en/latest/dates.html#parsing-date-queries).
|
||||
learn more about the query language used by paperless, see the
|
||||
[Tantivy query language documentation](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html).
|
||||
|
||||
!!! note
|
||||
|
||||
Fuzzy (approximate) matching can be enabled by setting
|
||||
[`PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD`](configuration.md#PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD).
|
||||
When enabled, paperless will include near-miss results ranked below exact matches.
|
||||
|
||||
## Keyboard shortcuts / hotkeys
|
||||
|
||||
|
||||
@@ -75,11 +75,11 @@ dependencies = [
|
||||
"scikit-learn~=1.8.0",
|
||||
"sentence-transformers>=4.1",
|
||||
"setproctitle~=1.3.4",
|
||||
"tantivy>=0.25.1",
|
||||
"tika-client~=0.10.0",
|
||||
"torch~=2.10.0",
|
||||
"watchfiles>=1.1.1",
|
||||
"whitenoise~=6.11",
|
||||
"whoosh-reloaded>=2.7.5",
|
||||
"zxing-cpp~=3.0.0",
|
||||
]
|
||||
|
||||
@@ -123,6 +123,7 @@ testing = [
|
||||
"pytest-rerunfailures~=16.1",
|
||||
"pytest-sugar",
|
||||
"pytest-xdist~=3.8.0",
|
||||
"time-machine>=2.13",
|
||||
]
|
||||
|
||||
lint = [
|
||||
@@ -299,6 +300,7 @@ markers = [
|
||||
"greenmail: Tests requiring Greenmail service",
|
||||
"date_parsing: Tests which cover date parsing from content or filename",
|
||||
"management: Tests which cover management commands/functionality",
|
||||
"search: Tests for the Tantivy search backend",
|
||||
]
|
||||
|
||||
[tool.pytest_env]
|
||||
|
||||
@@ -100,24 +100,23 @@ class DocumentAdmin(GuardedModelAdmin):
|
||||
return Document.global_objects.all()
|
||||
|
||||
def delete_queryset(self, request, queryset):
|
||||
from documents import index
|
||||
from documents.search import get_backend
|
||||
|
||||
with index.open_index_writer() as writer:
|
||||
with get_backend().batch_update() as batch:
|
||||
for o in queryset:
|
||||
index.remove_document(writer, o)
|
||||
|
||||
batch.remove(o.pk)
|
||||
super().delete_queryset(request, queryset)
|
||||
|
||||
def delete_model(self, request, obj):
|
||||
from documents import index
|
||||
from documents.search import get_backend
|
||||
|
||||
index.remove_document_from_index(obj)
|
||||
get_backend().remove(obj.pk)
|
||||
super().delete_model(request, obj)
|
||||
|
||||
def save_model(self, request, obj, form, change):
|
||||
from documents import index
|
||||
from documents.search import get_backend
|
||||
|
||||
index.add_or_update_document(obj)
|
||||
get_backend().add_or_update(obj)
|
||||
super().save_model(request, obj, form, change)
|
||||
|
||||
|
||||
|
||||
@@ -349,11 +349,11 @@ def delete(doc_ids: list[int]) -> Literal["OK"]:
|
||||
|
||||
Document.objects.filter(id__in=delete_ids).delete()
|
||||
|
||||
from documents import index
|
||||
from documents.search import get_backend
|
||||
|
||||
with index.open_index_writer() as writer:
|
||||
with get_backend().batch_update() as batch:
|
||||
for id in delete_ids:
|
||||
index.remove_document_by_id(writer, id)
|
||||
batch.remove(id)
|
||||
|
||||
status_mgr = DocumentsStatusManager()
|
||||
status_mgr.send_documents_deleted(delete_ids)
|
||||
|
||||
@@ -1,655 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
from collections import Counter
|
||||
from contextlib import contextmanager
|
||||
from datetime import UTC
|
||||
from datetime import datetime
|
||||
from datetime import time
|
||||
from datetime import timedelta
|
||||
from shutil import rmtree
|
||||
from time import sleep
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Literal
|
||||
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from django.conf import settings
|
||||
from django.utils import timezone as django_timezone
|
||||
from django.utils.timezone import get_current_timezone
|
||||
from django.utils.timezone import now
|
||||
from guardian.shortcuts import get_users_with_perms
|
||||
from whoosh import classify
|
||||
from whoosh import highlight
|
||||
from whoosh import query
|
||||
from whoosh.fields import BOOLEAN
|
||||
from whoosh.fields import DATETIME
|
||||
from whoosh.fields import KEYWORD
|
||||
from whoosh.fields import NUMERIC
|
||||
from whoosh.fields import TEXT
|
||||
from whoosh.fields import Schema
|
||||
from whoosh.highlight import HtmlFormatter
|
||||
from whoosh.idsets import BitSet
|
||||
from whoosh.idsets import DocIdSet
|
||||
from whoosh.index import FileIndex
|
||||
from whoosh.index import LockError
|
||||
from whoosh.index import create_in
|
||||
from whoosh.index import exists_in
|
||||
from whoosh.index import open_dir
|
||||
from whoosh.qparser import MultifieldParser
|
||||
from whoosh.qparser import QueryParser
|
||||
from whoosh.qparser.dateparse import DateParserPlugin
|
||||
from whoosh.qparser.dateparse import English
|
||||
from whoosh.qparser.plugins import FieldsPlugin
|
||||
from whoosh.scoring import TF_IDF
|
||||
from whoosh.util.times import timespan
|
||||
from whoosh.writing import AsyncWriter
|
||||
|
||||
from documents.models import CustomFieldInstance
|
||||
from documents.models import Document
|
||||
from documents.models import Note
|
||||
from documents.models import User
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from django.db.models import QuerySet
|
||||
from whoosh.reading import IndexReader
|
||||
from whoosh.searching import ResultsPage
|
||||
from whoosh.searching import Searcher
|
||||
|
||||
logger = logging.getLogger("paperless.index")
|
||||
|
||||
|
||||
def get_schema() -> Schema:
    # Returns the Whoosh schema for the document search index.
    # One entry is written per Document; `id` mirrors the database primary
    # key (stored, unique) so hits can be mapped back to model instances.
    return Schema(
        id=NUMERIC(stored=True, unique=True),
        # Full-text / sortable document fields.
        title=TEXT(sortable=True),
        content=TEXT(),
        asn=NUMERIC(sortable=True, signed=False),
        # Correspondent: searchable name, raw id, and a has_* flag so
        # "no correspondent" can be queried directly.
        correspondent=TEXT(sortable=True),
        correspondent_id=NUMERIC(),
        has_correspondent=BOOLEAN(),
        # Tags are comma-joined keyword lists (names and ids).
        tag=KEYWORD(commas=True, scorable=True, lowercase=True),
        tag_id=KEYWORD(commas=True, scorable=True),
        has_tag=BOOLEAN(),
        type=TEXT(sortable=True),
        type_id=NUMERIC(),
        has_type=BOOLEAN(),
        # Timestamps, all sortable for result ordering.
        created=DATETIME(sortable=True),
        modified=DATETIME(sortable=True),
        added=DATETIME(sortable=True),
        path=TEXT(sortable=True),
        path_id=NUMERIC(),
        has_path=BOOLEAN(),
        notes=TEXT(),
        num_notes=NUMERIC(sortable=True, signed=False),
        custom_fields=TEXT(),
        custom_field_count=NUMERIC(sortable=True, signed=False),
        has_custom_fields=BOOLEAN(),
        custom_fields_id=KEYWORD(commas=True),
        # Ownership / permissions: used by get_permissions_criterias().
        owner=TEXT(),
        owner_id=NUMERIC(),
        has_owner=BOOLEAN(),
        viewer_id=KEYWORD(commas=True),
        checksum=TEXT(),
        page_count=NUMERIC(sortable=True),
        original_filename=TEXT(sortable=True),
        is_shared=BOOLEAN(),
    )
|
||||
|
||||
|
||||
def open_index(*, recreate=False) -> FileIndex:
    """Open the search index, retrying transient failures; rebuild on error.

    Tries to open the existing index up to four times, sleeping briefly
    between attempts when the failure looks transient (missing file or a
    held write lock).  Any persistent or unexpected failure — or an explicit
    ``recreate=True`` — falls through to wiping the index directory and
    creating a fresh, empty index.
    """
    retriable = (FileNotFoundError, LockError)
    total_attempts = 4  # initial try plus three retries
    backoff_seconds = 0.1

    attempt = 0
    while attempt < total_attempts:
        try:
            if exists_in(settings.INDEX_DIR) and not recreate:
                return open_dir(settings.INDEX_DIR, schema=get_schema())
            break
        except retriable as exc:
            # With recreate requested there is no point retrying the open.
            if recreate or attempt == total_attempts - 1:
                logger.exception(
                    "Error while opening the index after retries, recreating.",
                )
                break
            logger.warning(
                "Transient error while opening the index (attempt %s/%s): %s. Retrying.",
                attempt + 1,
                total_attempts,
                exc,
            )
            sleep(backoff_seconds)
        except Exception:
            logger.exception("Error while opening the index, recreating.")
            break
        attempt += 1

    # create_in doesn't cope well with a corrupted index directory, so
    # remove it entirely before recreating.
    if settings.INDEX_DIR.is_dir():
        rmtree(settings.INDEX_DIR)
    settings.INDEX_DIR.mkdir(parents=True, exist_ok=True)

    return create_in(settings.INDEX_DIR, get_schema())
|
||||
|
||||
|
||||
@contextmanager
def open_index_writer(*, optimize=False) -> AsyncWriter:
    """Context manager yielding an AsyncWriter over the search index.

    If the ``with`` body raises, the exception is logged and the pending
    changes are cancelled before the writer is committed on exit.
    """
    async_writer = AsyncWriter(open_index())

    try:
        yield async_writer
    except Exception as e:
        logger.exception(str(e))
        async_writer.cancel()
    finally:
        # NOTE(review): commit() still runs after cancel() on the error
        # path — relies on AsyncWriter tolerating that sequence; confirm.
        async_writer.commit(optimize=optimize)
|
||||
|
||||
|
||||
@contextmanager
def open_index_searcher() -> Searcher:
    """Yield a Searcher over the index, guaranteeing it is closed on exit."""
    index_searcher = open_index().searcher()

    try:
        yield index_searcher
    finally:
        index_searcher.close()
|
||||
|
||||
|
||||
def update_document(
    writer: AsyncWriter,
    doc: Document,
    effective_content: str | None = None,
) -> None:
    """Write (or overwrite) the index entry for a single document.

    Args:
        writer: An already-open AsyncWriter for the search index.
        doc: The document to index.
        effective_content: Optional content override; falls back to
            ``doc.content`` when not provided (or empty).
    """
    # Evaluate each related queryset once instead of twice per field pair
    # (the original issued duplicate queries for tags and custom fields).
    doc_tags = list(doc.tags.all())
    tags = ",".join(t.name for t in doc_tags)
    tags_ids = ",".join(str(t.id) for t in doc_tags)
    notes = ",".join(str(c.note) for c in Note.objects.filter(document=doc))
    field_instances = list(CustomFieldInstance.objects.filter(document=doc))
    custom_fields = ",".join(str(c) for c in field_instances)
    custom_fields_ids = ",".join(str(f.field.id) for f in field_instances)

    # An out-of-range ASN is indexed as 0 (and logged) rather than failing
    # the whole index update.
    asn: int | None = doc.archive_serial_number
    if asn is not None and (
        asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
        or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
    ):
        logger.error(
            f"Not indexing Archive Serial Number {asn} of document {doc.pk}. "
            f"ASN is out of range "
            f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
            f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}.",
        )
        asn = 0

    # Users with explicit view permission; drives viewer_id / is_shared.
    users_with_perms = get_users_with_perms(
        doc,
        only_with_perms_in=["view_document"],
    )
    viewer_ids: str = ",".join(str(u.id) for u in users_with_perms)

    writer.update_document(
        id=doc.pk,
        title=doc.title,
        content=effective_content or doc.content,
        correspondent=doc.correspondent.name if doc.correspondent else None,
        correspondent_id=doc.correspondent.id if doc.correspondent else None,
        has_correspondent=doc.correspondent is not None,
        tag=tags if tags else None,
        tag_id=tags_ids if tags_ids else None,
        has_tag=len(tags) > 0,
        type=doc.document_type.name if doc.document_type else None,
        type_id=doc.document_type.id if doc.document_type else None,
        has_type=doc.document_type is not None,
        created=datetime.combine(doc.created, time.min),
        added=doc.added,
        asn=asn,
        modified=doc.modified,
        path=doc.storage_path.name if doc.storage_path else None,
        path_id=doc.storage_path.id if doc.storage_path else None,
        has_path=doc.storage_path is not None,
        notes=notes,
        # NOTE(review): length of the joined notes *string*, not the note
        # count — preserved as-is since sorting has always used this value;
        # confirm before "fixing".
        num_notes=len(notes),
        custom_fields=custom_fields,
        custom_field_count=len(doc.custom_fields.all()),
        has_custom_fields=len(custom_fields) > 0,
        custom_fields_id=custom_fields_ids if custom_fields_ids else None,
        owner=doc.owner.username if doc.owner else None,
        owner_id=doc.owner.id if doc.owner else None,
        has_owner=doc.owner is not None,
        viewer_id=viewer_ids if viewer_ids else None,
        checksum=doc.checksum,
        page_count=doc.page_count,
        original_filename=doc.original_filename,
        is_shared=len(viewer_ids) > 0,
    )
    logger.debug(f"Index updated for document {doc.pk}.")
|
||||
|
||||
|
||||
def remove_document(writer: AsyncWriter, doc: Document) -> None:
    """Delete the index entry for ``doc`` using an already-open writer."""
    remove_document_by_id(writer, doc.pk)
|
||||
|
||||
|
||||
def remove_document_by_id(writer: AsyncWriter, doc_id) -> None:
    """Delete the index entry whose ``id`` field equals ``doc_id``."""
    writer.delete_by_term("id", doc_id)
|
||||
|
||||
|
||||
def add_or_update_document(
    document: Document,
    effective_content: str | None = None,
) -> None:
    """Index ``document``, opening (and committing) a writer just for it."""
    with open_index_writer() as index_writer:
        update_document(index_writer, document, effective_content=effective_content)
|
||||
|
||||
|
||||
def remove_document_from_index(document: Document) -> None:
    """Remove ``document`` from the index, opening a writer just for it."""
    with open_index_writer() as index_writer:
        remove_document(index_writer, document)
|
||||
|
||||
|
||||
class MappedDocIdSet(DocIdSet):
    """
    A DocIdSet backed by a set of `Document` IDs.
    Supports efficiently looking up if a whoosh docnum is in the provided `filter_queryset`.
    """

    def __init__(self, filter_queryset: QuerySet, ixreader: IndexReader) -> None:
        super().__init__()
        id_values = filter_queryset.order_by("id").values_list("id", flat=True)
        # BitSet needs an upper bound; the largest id works (0 when empty).
        highest_id = id_values.last() or 0
        self.document_ids = BitSet(id_values, size=highest_id)
        self.ixreader = ixreader

    def __contains__(self, docnum) -> bool:
        # Translate the whoosh doc number to the stored Document pk first.
        stored_pk = self.ixreader.stored_fields(docnum)["id"]
        return stored_pk in self.document_ids

    def __bool__(self) -> Literal[True]:
        # searcher.search ignores a filter if it's "falsy"; force truthiness
        # so this set is always applied, even when empty.
        return True
|
||||
|
||||
|
||||
class DelayedQuery:
    """Lazily-executed search query that behaves like a sliced sequence.

    Subclasses implement ``_get_query`` (returning query, mask and an
    optional spelling correction).  Pages are fetched on ``__getitem__``
    and cached in ``saved_results``; hit scores are normalized against the
    first page's top score so the UI can show relative relevance.  Sorting
    by a custom field cannot be done inside whoosh and is handled via the
    "manual" path, which fetches all hits and reorders them in the ORM.
    """

    def _get_query(self):
        # Subclass hook: must return (query, mask, suggested_correction).
        raise NotImplementedError  # pragma: no cover

    def _get_query_sortedby(self) -> tuple[None, Literal[False]] | tuple[str, bool]:
        # Maps the REST `ordering` query param onto an index sort field;
        # (None, False) means "sort by relevance".
        if "ordering" not in self.query_params:
            return None, False

        field: str = self.query_params["ordering"]

        # REST ordering name -> index field name.
        sort_fields_map: dict[str, str] = {
            "created": "created",
            "modified": "modified",
            "added": "added",
            "title": "title",
            "correspondent__name": "correspondent",
            "document_type__name": "type",
            "archive_serial_number": "asn",
            "num_notes": "num_notes",
            "owner": "owner",
            "page_count": "page_count",
        }

        # A leading "-" requests descending order.
        if field.startswith("-"):
            field = field[1:]
            reverse = True
        else:
            reverse = False

        # Unknown fields silently fall back to relevance ordering.
        if field not in sort_fields_map:
            return None, False
        else:
            return sort_fields_map[field], reverse

    def __init__(
        self,
        searcher: Searcher,
        query_params,
        page_size,
        filter_queryset: QuerySet,
    ) -> None:
        self.searcher = searcher
        self.query_params = query_params
        self.page_size = page_size
        # Page cache keyed by slice start offset.
        self.saved_results = dict()
        # Top score of the first fetched page; used to normalize all scores.
        self.first_score = None
        self.filter_queryset = filter_queryset
        self.suggested_correction = None
        # Full hit list cache for the custom-field (manual) sort path.
        self._manual_hits_cache: list | None = None

    def __len__(self) -> int:
        if self._manual_sort_requested():
            manual_hits = self._manual_hits()
            return len(manual_hits)

        # Fetching any page exposes the total count via the page object.
        page = self[0:1]
        return len(page)

    def _manual_sort_requested(self):
        # Custom-field ordering is not an index field; it needs the manual path.
        ordering = self.query_params.get("ordering", "")
        return ordering.lstrip("-").startswith("custom_field_")

    def _manual_hits(self):
        # Runs the query without a page limit and reorders the hits to match
        # the (custom-field-sorted) filter_queryset.  Result is cached.
        if self._manual_hits_cache is None:
            q, mask, suggested_correction = self._get_query()
            self.suggested_correction = suggested_correction

            results = self.searcher.search(
                q,
                mask=mask,
                filter=MappedDocIdSet(self.filter_queryset, self.searcher.ixreader),
                limit=None,
            )
            results.fragmenter = highlight.ContextFragmenter(surround=50)
            results.formatter = HtmlFormatter(tagname="span", between=" ... ")

            if not self.first_score and len(results) > 0:
                self.first_score = results[0].score

            # Normalize scores relative to the best hit (top_n pairs are
            # (score, docnum)).
            if self.first_score:
                results.top_n = [
                    (
                        (hit[0] / self.first_score) if self.first_score else None,
                        hit[1],
                    )
                    for hit in results.top_n
                ]

            hits_by_id = {hit["id"]: hit for hit in results}
            matching_ids = list(hits_by_id.keys())

            # Let the ORM apply the custom-field ordering, then realign hits.
            ordered_ids = list(
                self.filter_queryset.filter(id__in=matching_ids).values_list(
                    "id",
                    flat=True,
                ),
            )
            # Drop duplicates while keeping the first occurrence's position.
            ordered_ids = list(dict.fromkeys(ordered_ids))

            self._manual_hits_cache = [
                hits_by_id[_id] for _id in ordered_ids if _id in hits_by_id
            ]
        return self._manual_hits_cache

    def __getitem__(self, item):
        # `item` is a slice; pages are cached by their start offset.
        if item.start in self.saved_results:
            return self.saved_results[item.start]

        if self._manual_sort_requested():
            manual_hits = self._manual_hits()
            start = 0 if item.start is None else item.start
            stop = item.stop
            hits = manual_hits[start:stop] if stop is not None else manual_hits[start:]
            page = ManualResultsPage(hits)
            self.saved_results[start] = page
            return page

        q, mask, suggested_correction = self._get_query()
        self.suggested_correction = suggested_correction
        sortedby, reverse = self._get_query_sortedby()

        # whoosh pages are 1-based; derive the page number from the offset.
        page: ResultsPage = self.searcher.search_page(
            q,
            mask=mask,
            filter=MappedDocIdSet(self.filter_queryset, self.searcher.ixreader),
            pagenum=math.floor(item.start / self.page_size) + 1,
            pagelen=self.page_size,
            sortedby=sortedby,
            reverse=reverse,
        )
        page.results.fragmenter = highlight.ContextFragmenter(surround=50)
        page.results.formatter = HtmlFormatter(tagname="span", between=" ... ")

        # Only capture a reference score for relevance-ordered queries.
        if not self.first_score and len(page.results) > 0 and sortedby is None:
            self.first_score = page.results[0].score

        page.results.top_n = [
            (
                (hit[0] / self.first_score) if self.first_score else None,
                hit[1],
            )
            for hit in page.results.top_n
        ]

        self.saved_results[item.start] = page

        return page
|
||||
|
||||
|
||||
class ManualResultsPage(list):
    """A list of hits that also exposes ``results`` like a whoosh ResultsPage."""

    def __init__(self, hits) -> None:
        super().__init__(hits)
        self.results = ManualResults(hits)
|
||||
|
||||
|
||||
class ManualResults:
    """Minimal stand-in for whoosh Results: only ``docs()`` is provided."""

    def __init__(self, hits) -> None:
        self._docnums = [hit.docnum for hit in hits]

    def docs(self):
        # Mirrors whoosh Results.docs(): the matching document numbers.
        return self._docnums
|
||||
|
||||
|
||||
class LocalDateParser(English):
    """English date parser that shifts parsed dates from the active Django
    timezone to UTC, matching how datetimes are stored in the index."""

    def reverse_timezone_offset(self, d):
        localized = d.replace(tzinfo=django_timezone.get_current_timezone())
        return localized.astimezone(UTC)

    def date_from(self, *args, **kwargs):
        parsed = super().date_from(*args, **kwargs)
        # A timespan carries two endpoints; shift both in place.
        if isinstance(parsed, timespan):
            parsed.start = self.reverse_timezone_offset(parsed.start)
            parsed.end = self.reverse_timezone_offset(parsed.end)
        elif isinstance(parsed, datetime):
            parsed = self.reverse_timezone_offset(parsed)
        return parsed
|
||||
|
||||
|
||||
class DelayedFullTextQuery(DelayedQuery):
    """Free-text search across all text fields, with spelling correction."""

    def _get_query(self) -> tuple:
        raw_query = rewrite_natural_date_keywords(self.query_params["query"])

        parser = MultifieldParser(
            [
                "content",
                "title",
                "correspondent",
                "tag",
                "type",
                "notes",
                "custom_fields",
            ],
            self.searcher.ixreader.schema,
        )
        # Understand natural-language dates, interpreted in the local TZ.
        parser.add_plugin(
            DateParserPlugin(
                basedate=django_timezone.now(),
                dateparser=LocalDateParser(),
            ),
        )
        parsed = parser.parse(raw_query)

        # Offer a "did you mean" correction, but only when the corrected
        # query actually matches at least one document the user can see.
        suggested_correction = None
        try:
            corrected = self.searcher.correct_query(parsed, raw_query)
            if corrected.string != raw_query:
                probe = self.searcher.search(
                    corrected.query,
                    limit=1,
                    filter=MappedDocIdSet(self.filter_queryset, self.searcher.ixreader),
                    scored=False,
                )
                if len(probe) > 0:
                    suggested_correction = corrected.string
        except Exception as e:
            # Correction is best-effort; never let it break the search itself.
            logger.info(
                "Error while correcting query %s: %s",
                f"{raw_query!r}",
                e,
            )

        return parsed, None, suggested_correction
|
||||
|
||||
|
||||
class DelayedMoreLikeThisQuery(DelayedQuery):
    """Find documents similar to a reference document via its key terms."""

    def _get_query(self) -> tuple:
        reference_id = int(self.query_params["more_like_id"])
        reference_content = Document.objects.get(id=reference_id).content

        # The reference document itself is masked out of the results.
        reference_docnum = self.searcher.document_number(id=reference_id)

        key_terms = self.searcher.key_terms_from_text(
            "content",
            reference_content,
            numterms=20,
            model=classify.Bo1Model,
            normalize=False,
        )
        # OR the top terms together, each boosted by its Bo1 weight.
        similarity_query = query.Or(
            [query.Term("content", word, boost=weight) for word, weight in key_terms],
        )
        return similarity_query, {reference_docnum}, None
|
||||
|
||||
|
||||
def autocomplete(
    ix: FileIndex,
    term: str,
    limit: int = 10,
    user: User | None = None,
) -> list:
    """
    Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions
    and without scoring
    """
    completions = []

    with ix.searcher(weighting=TF_IDF()) as s:
        parser = QueryParser("content", schema=ix.schema)
        # Strip field syntax so a term that happens to look like "field:x"
        # still queries the content field rather than returning bogus data.
        parser.remove_plugin_class(FieldsPlugin)
        parsed = parser.parse(f"{term.lower()}*")
        user_criterias: list = get_permissions_criterias(user)

        results = s.search(
            parsed,
            terms=True,
            filter=query.Or(user_criterias) if user_criterias is not None else None,
        )

        # Rank candidate terms by how many matched documents contain them.
        counts = Counter()
        if results.has_matched_terms():
            for hit in results:
                for _, matched in hit.matched_terms():
                    counts[matched] += 1
            completions = [t for t, _ in counts.most_common(limit)]

        # An exact match for the typed term always sorts first.
        encoded: bytes = term.encode("UTF-8")
        if encoded in completions:
            completions.insert(0, completions.pop(completions.index(encoded)))

    return completions
|
||||
|
||||
|
||||
def get_permissions_criterias(user: User | None = None) -> list:
    """Build whoosh terms restricting results to documents ``user`` may view.

    Anonymous callers see only unowned documents; superusers get an empty
    criteria list (no restriction); everyone else sees unowned documents,
    their own documents, and documents explicitly shared with them.
    """
    if user is not None and user.is_superuser:  # superusers see all docs
        return []

    criterias = [query.Term("has_owner", text=False)]
    if user is not None:
        criterias.append(query.Term("owner_id", user.id))
        criterias.append(
            query.Term("viewer_id", str(user.id)),
        )
    return criterias
|
||||
|
||||
|
||||
def rewrite_natural_date_keywords(query_string: str) -> str:
    """
    Rewrites natural date keywords (e.g. added:today or added:"yesterday") to UTC range syntax for Whoosh.
    This resolves timezone issues with date parsing in Whoosh as well as adding support for more
    natural date keywords.

    Ranges are computed in the active Django timezone and then converted to
    UTC ``YYYYMMDDHHMMSS`` strings inside ``repl``, so a query like
    ``added:today`` matches the user's local calendar day, not the server's.
    """

    tz = get_current_timezone()
    local_now = now().astimezone(tz)
    today = local_now.date()

    # all supported Keywords
    # NOTE(review): every alternative listed here must have a matching `case`
    # arm in repl() below — a keyword without one leaves start/end unbound
    # and raises NameError at query time.
    pattern = r"(\b(?:added|created|modified))\s*:\s*[\"']?(today|yesterday|this month|previous month|previous week|previous quarter|this year|previous year)[\"']?"

    def repl(m):
        # group(1) = field name, group(2) = keyword (matched case-insensitively,
        # hence the .lower() so the match statement sees a canonical form).
        field = m.group(1)
        keyword = m.group(2).lower()

        match keyword:
            case "today":
                # Full local calendar day: 00:00:00 .. 23:59:59.999999.
                start = datetime.combine(today, time.min, tzinfo=tz)
                end = datetime.combine(today, time.max, tzinfo=tz)

            case "yesterday":
                yesterday = today - timedelta(days=1)
                start = datetime.combine(yesterday, time.min, tzinfo=tz)
                end = datetime.combine(yesterday, time.max, tzinfo=tz)

            case "this month":
                start = datetime(local_now.year, local_now.month, 1, 0, 0, 0, tzinfo=tz)
                # relativedelta handles month-length/rollover; minus 1s keeps
                # the range inside the month.
                end = start + relativedelta(months=1) - timedelta(seconds=1)

            case "previous month":
                this_month_start = datetime(
                    local_now.year,
                    local_now.month,
                    1,
                    0,
                    0,
                    0,
                    tzinfo=tz,
                )
                start = this_month_start - relativedelta(months=1)
                end = this_month_start - timedelta(seconds=1)

            case "this year":
                start = datetime(local_now.year, 1, 1, 0, 0, 0, tzinfo=tz)
                end = datetime(local_now.year, 12, 31, 23, 59, 59, tzinfo=tz)

            case "previous week":
                # weekday(): Monday == 0, so this_week_start is local Monday 00:00.
                days_since_monday = local_now.weekday()
                this_week_start = datetime.combine(
                    today - timedelta(days=days_since_monday),
                    time.min,
                    tzinfo=tz,
                )
                start = this_week_start - timedelta(days=7)
                end = this_week_start - timedelta(seconds=1)

            case "previous quarter":
                # Quarters are calendar quarters: Q1 starts in month 1, Q2 in 4, ...
                current_quarter = (local_now.month - 1) // 3 + 1
                this_quarter_start_month = (current_quarter - 1) * 3 + 1
                this_quarter_start = datetime(
                    local_now.year,
                    this_quarter_start_month,
                    1,
                    0,
                    0,
                    0,
                    tzinfo=tz,
                )
                start = this_quarter_start - relativedelta(months=3)
                end = this_quarter_start - timedelta(seconds=1)

            case "previous year":
                start = datetime(local_now.year - 1, 1, 1, 0, 0, 0, tzinfo=tz)
                end = datetime(local_now.year - 1, 12, 31, 23, 59, 59, tzinfo=tz)

        # Convert to UTC and format
        start_str = start.astimezone(UTC).strftime("%Y%m%d%H%M%S")
        end_str = end.astimezone(UTC).strftime("%Y%m%d%H%M%S")
        return f"{field}:[{start_str} TO {end_str}]"

    return re.sub(pattern, repl, query_string, flags=re.IGNORECASE)
|
||||
@@ -1,3 +1,4 @@
|
||||
from django.conf import settings
|
||||
from django.db import transaction
|
||||
|
||||
from documents.management.commands.base import PaperlessCommand
|
||||
@@ -14,10 +15,20 @@ class Command(PaperlessCommand):
|
||||
def add_arguments(self, parser):
    """Register the subcommand positional and the --recreate flag."""
    super().add_arguments(parser)
    # Positional selector: either rebuild the index or compact it.
    parser.add_argument("command", choices=["reindex", "optimize"])
    recreate_kwargs = {
        "action": "store_true",
        "default": False,
        "help": "Wipe and recreate the index from scratch (only used with reindex).",
    }
    parser.add_argument("--recreate", **recreate_kwargs)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
with transaction.atomic():
|
||||
if options["command"] == "reindex":
|
||||
if options.get("recreate"):
|
||||
from documents.search import wipe_index
|
||||
|
||||
wipe_index(settings.INDEX_DIR)
|
||||
index_reindex(
|
||||
iter_wrapper=lambda docs: self.track(
|
||||
docs,
|
||||
|
||||
19
src/documents/search/__init__.py
Normal file
19
src/documents/search/__init__.py
Normal file
@@ -0,0 +1,19 @@
|
||||
"""Public facade for the Tantivy-based search package.

Re-exports the backend types and module-level helpers so callers can write
``from documents.search import get_backend`` without depending on the
internal module layout (``_backend`` / ``_schema``).
"""

from documents.search._backend import SearchIndexLockError
from documents.search._backend import SearchResults
from documents.search._backend import TantivyBackend
from documents.search._backend import TantivyRelevanceList
from documents.search._backend import WriteBatch
from documents.search._backend import get_backend
from documents.search._backend import reset_backend
from documents.search._schema import wipe_index

__all__ = [
    "SearchIndexLockError",
    "SearchResults",
    "TantivyBackend",
    "TantivyRelevanceList",
    "WriteBatch",
    "get_backend",
    "reset_backend",
    "wipe_index",
]
|
||||
733
src/documents/search/_backend.py
Normal file
733
src/documents/search/_backend.py
Normal file
@@ -0,0 +1,733 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Self
|
||||
from typing import TypedDict
|
||||
from typing import TypeVar
|
||||
|
||||
import filelock
|
||||
import regex
|
||||
import tantivy
|
||||
from django.conf import settings
|
||||
from django.utils.timezone import get_current_timezone
|
||||
from guardian.shortcuts import get_users_with_perms
|
||||
|
||||
from documents.search._query import build_permission_filter
|
||||
from documents.search._query import parse_user_query
|
||||
from documents.search._schema import _write_sentinels
|
||||
from documents.search._schema import build_schema
|
||||
from documents.search._schema import open_or_rebuild_index
|
||||
from documents.search._schema import wipe_index
|
||||
from documents.search._tokenizer import register_tokenizers
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
|
||||
from django.contrib.auth.base_user import AbstractBaseUser
|
||||
from django.db.models import QuerySet
|
||||
|
||||
from documents.models import Document
|
||||
|
||||
logger = logging.getLogger("paperless.search")

# Token pattern for autocomplete extraction; \w+ mirrors Tantivy's simple
# word-splitting tokenizer so suggestions line up with indexed terms.
_WORD_RE = regex.compile(r"\w+")
_AUTOCOMPLETE_REGEX_TIMEOUT = 1.0  # seconds; guards against ReDoS on untrusted content

# Generic element type for the pass-through iterable wrapper (_identity).
T = TypeVar("T")
||||
|
||||
|
||||
def _identity(iterable: Iterable[T]) -> Iterable[T]:
|
||||
"""Default iter_wrapper that passes through unchanged."""
|
||||
return iterable
|
||||
|
||||
|
||||
def _ascii_fold(s: str) -> str:
|
||||
"""Normalize unicode to ASCII equivalent characters."""
|
||||
return unicodedata.normalize("NFD", s).encode("ascii", "ignore").decode()
|
||||
|
||||
|
||||
def _extract_autocomplete_words(text_sources: list[str]) -> set[str]:
    """Collect normalized autocomplete tokens from the given text sources.

    Tokens are split on non-word characters (matching Tantivy's simple
    tokenizer), lowercased, and ascii-folded. Extraction uses the ``regex``
    library with a timeout as a ReDoS guard on untrusted document content;
    a source that times out is skipped with a warning.
    """
    collected: set[str] = set()
    for source in text_sources:
        if not source:
            continue
        try:
            raw_tokens = _WORD_RE.findall(source, timeout=_AUTOCOMPLETE_REGEX_TIMEOUT)
        except regex.TimeoutError:
            logger.warning(
                "Autocomplete word extraction timed out for a text source; skipping.",
            )
            continue
        collected.update(
            folded
            for folded in (_ascii_fold(raw.lower()) for raw in raw_tokens)
            if folded
        )
    return collected
|
||||
|
||||
|
||||
# Shape of a single search result entry produced by the backend.
# Functional TypedDict form; instances are plain dicts at runtime.
SearchHit = TypedDict(
    "SearchHit",
    {
        "id": int,
        "score": float,
        "rank": int,
        "highlights": dict[str, str],
    },
)
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
class SearchResults:
    """Immutable container for one page of backend search results."""

    hits: list[SearchHit]
    total: int  # total matching documents (for pagination)
    query: str  # preprocessed query string
|
||||
|
||||
|
||||
class TantivyRelevanceList:
    """DRF-compatible list wrapper for Tantivy search hits.

    __len__ returns the total hit count (for pagination); __getitem__ slices
    the hit list. Stores ALL post-filter hits so that get_all_result_ids()
    can return every matching doc ID without a second query.
    """

    def __init__(self, hits: list[SearchHit]) -> None:
        # Full post-filter hit list; kept whole so callers can fetch every ID.
        self._hits = hits

    def __len__(self) -> int:
        """Total number of hits — what DRF paginators consult."""
        return len(self._hits)

    def __getitem__(self, key: slice) -> list[SearchHit]:
        """Return the requested slice of hits (paginator page access)."""
        return self._hits[key]
|
||||
|
||||
|
||||
class SearchIndexLockError(Exception):
    """Raised when the search index write lock cannot be acquired in time."""
|
||||
|
||||
|
||||
class WriteBatch:
    """Context manager for bulk index operations with file locking.

    Acquires a cross-process file lock (on-disk indexes only) on entry,
    opens a tantivy writer, and on clean exit commits + reloads the index.
    On exception the writer is dropped uncommitted.
    """

    def __init__(self, backend: TantivyBackend, lock_timeout: float):
        self._backend = backend
        self._lock_timeout = lock_timeout
        self._writer = None
        self._lock = None

    def __enter__(self) -> Self:
        # File lock only applies to on-disk indexes; in-memory indexes
        # (path is None, used in tests) skip locking entirely.
        if self._backend._path is not None:
            lock_path = self._backend._path / ".tantivy.lock"
            self._lock = filelock.FileLock(str(lock_path))
            try:
                self._lock.acquire(timeout=self._lock_timeout)
            except filelock.Timeout as e:
                raise SearchIndexLockError(
                    f"Could not acquire index lock within {self._lock_timeout}s",
                ) from e

        self._writer = self._backend._index.writer()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Commit-or-discard, then always release locks (tantivy's and ours).
        try:
            if exc_type is None:
                # Success case - commit changes
                self._writer.commit()
                self._backend._index.reload()
            else:
                # Exception occurred - discard changes
                # Writer is automatically discarded when it goes out of scope
                pass
            # Explicitly delete writer to release tantivy's internal lock
            if self._writer is not None:
                del self._writer
                self._writer = None
        finally:
            if self._lock is not None:
                self._lock.release()

    def add_or_update(
        self,
        document: Document,
        effective_content: str | None = None,
    ) -> None:
        """Add or update a document in the batch.

        Tantivy has no native upsert — we delete by id then re-add so
        stale copies (e.g. after a permission change) don't linger.
        ``effective_content`` overrides ``document.content`` for indexing.
        """
        self.remove(document.pk)
        doc = self._backend._build_tantivy_doc(document, effective_content)
        self._writer.add_document(doc)

    def remove(self, doc_id: int) -> None:
        """Remove a document from the batch."""
        # Use range query to work around u64 deletion bug
        # (doc_id..doc_id range matches exactly the one document).
        self._writer.delete_documents_by_query(
            tantivy.Query.range_query(
                self._backend._schema,
                "id",
                tantivy.FieldType.Unsigned,
                doc_id,
                doc_id,
            ),
        )
|
||||
|
||||
|
||||
class TantivyBackend:
    """Tantivy search backend with explicit lifecycle management.

    Wraps a tantivy index (on-disk or in-memory) and exposes document
    add/remove, full-text search, autocomplete, more-like-this, and a
    full rebuild. Write operations go through WriteBatch for locking.
    """

    def __init__(self, path: Path | None = None):
        # path=None → in-memory index (for tests)
        # path=some_dir → on-disk index (for production)
        self._path = path
        self._index = None
        self._schema = None

    def open(self) -> None:
        """Open or rebuild the index. Idempotent."""
        if self._index is not None:
            return
        if self._path is not None:
            self._index = open_or_rebuild_index(self._path)
        else:
            self._index = tantivy.Index(build_schema())
        register_tokenizers(self._index, settings.SEARCH_LANGUAGE)
        self._schema = self._index.schema

    def close(self) -> None:
        """Close the index. Idempotent."""
        # Dropping the references is sufficient; tantivy has no explicit close.
        self._index = None
        self._schema = None

    def _ensure_open(self) -> None:
        """Ensure the index is open before operations."""
        if self._index is None:
            self.open()

    def _build_tantivy_doc(
        self,
        document: Document,
        effective_content: str | None = None,
    ) -> tantivy.Document:
        """Build a tantivy Document from a Django Document instance.

        ``effective_content`` overrides ``document.content`` for indexing —
        used when re-indexing a root document with a newer version's OCR text.
        """
        content = (
            effective_content if effective_content is not None else document.content
        )

        doc = tantivy.Document()

        # Basic fields
        doc.add_unsigned("id", document.pk)
        doc.add_text("checksum", document.checksum)
        doc.add_text("title", document.title)
        doc.add_text("title_sort", document.title)
        doc.add_text("content", content)
        doc.add_text("bigram_content", content)

        # Original filename - only add if not None/empty
        if document.original_filename:
            doc.add_text("original_filename", document.original_filename)

        # Correspondent
        if document.correspondent:
            doc.add_text("correspondent", document.correspondent.name)
            doc.add_text("correspondent_sort", document.correspondent.name)
            doc.add_unsigned("correspondent_id", document.correspondent_id)

        # Document type
        if document.document_type:
            doc.add_text("document_type", document.document_type.name)
            doc.add_text("type_sort", document.document_type.name)
            doc.add_unsigned("document_type_id", document.document_type_id)

        # Storage path
        if document.storage_path:
            doc.add_text("storage_path", document.storage_path.name)
            doc.add_unsigned("storage_path_id", document.storage_path_id)

        # Tags
        for tag in document.tags.all():
            doc.add_text("tag", tag.name)
            doc.add_unsigned("tag_id", tag.pk)

        # Notes — JSON for structured queries (notes.user:alice, notes.note:text),
        # companion text field for default full-text search.
        for note in document.notes.all():
            note_data: dict[str, str] = {"note": note.note}
            if note.user:
                note_data["user"] = note.user.username
            doc.add_json("notes", note_data)
            doc.add_text("note", note.note)

        # Custom fields — JSON for structured queries (custom_fields.name:x, custom_fields.value:y),
        # companion text field for default full-text search.
        for cfi in document.custom_fields.all():
            doc.add_json(
                "custom_fields",
                {
                    "name": cfi.field.name,
                    "value": str(cfi.value),
                },
            )
            doc.add_text("custom_field", str(cfi.value))

        # Dates - created is date-only, others are full datetime
        created_date = datetime(
            document.created.year,
            document.created.month,
            document.created.day,
            tzinfo=UTC,
        )
        doc.add_date("created", created_date)
        doc.add_date("modified", document.modified)
        doc.add_date("added", document.added)

        # ASN - skip entirely when None (0 is valid)
        if document.archive_serial_number is not None:
            doc.add_unsigned("asn", document.archive_serial_number)

        # Page count - only add if not None
        if document.page_count is not None:
            doc.add_unsigned("page_count", document.page_count)

        # Number of notes
        doc.add_unsigned("num_notes", document.notes.count())

        # Owner
        if document.owner_id:
            doc.add_unsigned("owner_id", document.owner_id)

        # Viewers with permission
        users_with_perms = get_users_with_perms(
            document,
            only_with_perms_in=["view_document"],
        )
        for user in users_with_perms:
            doc.add_unsigned("viewer_id", user.pk)

        # Autocomplete word sources: title, content, and named metadata.
        # NOTE(review): an earlier comment mentioned "NLTK stopword filtering",
        # but _extract_autocomplete_words performs none — confirm intended.
        text_sources = [document.title, content]
        if document.correspondent:
            text_sources.append(document.correspondent.name)
        if document.document_type:
            text_sources.append(document.document_type.name)
        for tag in document.tags.all():
            text_sources.append(tag.name)

        autocomplete_words = _extract_autocomplete_words(text_sources)

        # Add sorted deduplicated words
        for word in sorted(autocomplete_words):
            doc.add_text("autocomplete_word", word)

        return doc

    def add_or_update(
        self,
        document: Document,
        effective_content: str | None = None,
    ) -> None:
        """Add or update a single document with file locking."""
        self._ensure_open()
        with self.batch_update(lock_timeout=5.0) as batch:
            batch.add_or_update(document, effective_content)

    def remove(self, doc_id: int) -> None:
        """Remove a single document with file locking."""
        self._ensure_open()
        with self.batch_update(lock_timeout=5.0) as batch:
            batch.remove(doc_id)

    def search(
        self,
        query: str,
        user: AbstractBaseUser | None,
        page: int,
        page_size: int,
        sort_field: str | None,
        *,
        sort_reverse: bool,
    ) -> SearchResults:
        """Search the index.

        Returns one page of hits (1-based ``page``) with normalized scores and
        content/notes highlights. ``user=None`` means no permission filtering
        (caller passes None for superusers).
        """
        self._ensure_open()
        tz = get_current_timezone()
        user_query = parse_user_query(self._index, query, tz)

        # Apply permission filter if user is not None (not superuser)
        if user is not None:
            permission_filter = build_permission_filter(self._schema, user)
            final_query = tantivy.Query.boolean_query(
                [
                    (tantivy.Occur.Must, user_query),
                    (tantivy.Occur.Must, permission_filter),
                ],
            )
        else:
            final_query = user_query

        searcher = self._index.searcher()
        offset = (page - 1) * page_size

        # Map sort fields
        sort_field_map = {
            "title": "title_sort",
            "correspondent__name": "correspondent_sort",
            "document_type__name": "type_sort",
            "created": "created",
            "added": "added",
            "modified": "modified",
            "archive_serial_number": "asn",
            "page_count": "page_count",
            "num_notes": "num_notes",
        }

        # Perform search
        if sort_field and sort_field in sort_field_map:
            mapped_field = sort_field_map[sort_field]
            if sort_reverse:
                # For reverse sort, we need to use a different approach
                # tantivy doesn't directly support reverse field sorting in the Python API
                # We'll search for more results and sort in Python
                # NOTE(review): no Python-side sort actually happens below —
                # hits keep searcher.search()'s own order; confirm intended.
                results = searcher.search(final_query, limit=offset + page_size * 10)
                # For field sorting: just DocAddress (no score)
                all_hits = [
                    (hit, 0.0) for hit in results.hits
                ]  # score is 0 for field sorts
            else:
                results = searcher.search(
                    final_query,
                    limit=offset + page_size,
                    order_by_field=mapped_field,
                )
                # For field sorting: just DocAddress (no score)
                all_hits = [
                    (hit, 0.0) for hit in results.hits
                ]  # score is 0 for field sorts
        else:
            # Score-based search returns: (score, doc_address) tuple
            results = searcher.search(final_query, limit=offset + page_size)
            # Convert to (doc_address, score) for consistency
            all_hits = [(hit[1], hit[0]) for hit in results.hits]

        total = results.count

        # Normalize scores for score-based searches
        # NOTE(review): assumes max_score > 0 for any non-empty hit list;
        # a zero max score would divide by zero — confirm scores are positive.
        if not sort_field and all_hits:
            scores = [hit[1] for hit in all_hits]
            max_score = max(scores) if scores else 1.0
            all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]

        # Apply threshold filter if configured
        threshold = getattr(settings, "ADVANCED_FUZZY_SEARCH_THRESHOLD", None)
        if (
            threshold is not None and not sort_field
        ):  # Only apply threshold to score-based search
            all_hits = [hit for hit in all_hits if hit[1] >= threshold]

        # Get the page's hits
        page_hits = all_hits[offset : offset + page_size]

        # Build result hits with highlights
        hits: list[SearchHit] = []
        snippet_generator = None

        for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
            # Get the actual document from the searcher using the doc address
            actual_doc = searcher.doc(doc_address)
            doc_dict = actual_doc.to_dict()
            doc_id = doc_dict["id"][0]

            highlights: dict[str, str] = {}

            # Generate highlights if score > 0
            if score > 0:
                try:
                    # Content snippet generator is created lazily once and
                    # reused across hits on this page.
                    if snippet_generator is None:
                        snippet_generator = tantivy.SnippetGenerator.create(
                            searcher,
                            final_query,
                            self._schema,
                            "content",
                        )

                    content_snippet = snippet_generator.snippet_from_doc(actual_doc)
                    if content_snippet:
                        highlights["content"] = str(content_snippet)

                    # Try notes highlights
                    if "notes" in doc_dict:
                        notes_generator = tantivy.SnippetGenerator.create(
                            searcher,
                            final_query,
                            self._schema,
                            "notes",
                        )
                        notes_snippet = notes_generator.snippet_from_doc(actual_doc)
                        if notes_snippet:
                            highlights["notes"] = str(notes_snippet)

                except Exception as e:
                    # Highlights are best-effort; a failure never fails the search.
                    logger.debug(f"Failed to generate highlights for doc {doc_id}: {e}")

            hits.append(
                SearchHit(
                    id=doc_id,
                    score=score,
                    rank=rank,
                    highlights=highlights,
                ),
            )

        return SearchResults(
            hits=hits,
            total=total,
            query=query,
        )

    def autocomplete(
        self,
        term: str,
        limit: int,
        user: AbstractBaseUser | None = None,
    ) -> list[str]:
        """Get autocomplete suggestions, optionally filtered by user visibility."""
        self._ensure_open()
        normalized_term = _ascii_fold(term.lower())

        searcher = self._index.searcher()

        # Apply permission filter for non-superusers so autocomplete words
        # from invisible documents don't leak to other users.
        if user is not None and not user.is_superuser:
            base_query = build_permission_filter(self._schema, user)
        else:
            base_query = tantivy.Query.all_query()

        # NOTE(review): hard cap of 10000 visible documents scanned per call —
        # suggestions from documents beyond that are missed on large instances.
        results = searcher.search(base_query, limit=10000)

        # Count how many visible documents each word appears in.
        # Using Counter (not set) preserves per-word document frequency so
        # we can rank suggestions by how commonly they occur — the same
        # signal Whoosh used for Tf/Idf-based autocomplete ordering.
        word_counts: Counter[str] = Counter()
        for hit in results.hits:
            # hits are (score, doc_address) tuples
            doc_address = hit[1] if len(hit) == 2 else hit[0]

            stored_doc = searcher.doc(doc_address)
            doc_dict = stored_doc.to_dict()
            if "autocomplete_word" in doc_dict:
                word_counts.update(doc_dict["autocomplete_word"])

        # Filter to prefix matches, then sort by document frequency descending
        # so the most-used matching word comes first.
        matches = sorted(
            (w for w in word_counts if w.startswith(normalized_term)),
            key=lambda w: -word_counts[w],
        )

        return matches[:limit]

    def more_like_this(
        self,
        doc_id: int,
        user: AbstractBaseUser | None,
        page: int,
        page_size: int,
    ) -> SearchResults:
        """Find documents similar to the given document.

        Returns an empty result set if ``doc_id`` is not in the index.
        The source document itself is excluded from the returned hits.
        """
        self._ensure_open()
        searcher = self._index.searcher()

        # First find the document address
        id_query = tantivy.Query.range_query(
            self._schema,
            "id",
            tantivy.FieldType.Unsigned,
            doc_id,
            doc_id,
        )
        results = searcher.search(id_query, limit=1)

        if not results.hits:
            # Document not found
            return SearchResults(hits=[], total=0, query=f"more_like:{doc_id}")

        # Extract doc_address from (score, doc_address) tuple
        doc_address = results.hits[0][1]

        # Build more like this query
        mlt_query = tantivy.Query.more_like_this_query(
            doc_address,
            min_doc_frequency=1,
            max_doc_frequency=None,
            min_term_frequency=1,
            max_query_terms=12,
            min_word_length=None,
            max_word_length=None,
            boost_factor=None,
        )

        # Apply permission filter
        if user is not None:
            permission_filter = build_permission_filter(self._schema, user)
            final_query = tantivy.Query.boolean_query(
                [
                    (tantivy.Occur.Must, mlt_query),
                    (tantivy.Occur.Must, permission_filter),
                ],
            )
        else:
            final_query = mlt_query

        # Search
        offset = (page - 1) * page_size
        results = searcher.search(final_query, limit=offset + page_size)

        total = results.count
        # Convert from (score, doc_address) to (doc_address, score)
        all_hits = [(hit[1], hit[0]) for hit in results.hits]

        # Normalize scores
        if all_hits:
            max_score = max(hit[1] for hit in all_hits) or 1.0
            all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]

        # Get page hits
        page_hits = all_hits[offset : offset + page_size]

        # Build results
        hits: list[SearchHit] = []
        for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
            actual_doc = searcher.doc(doc_address)
            doc_dict = actual_doc.to_dict()
            result_doc_id = doc_dict["id"][0]

            # Skip the original document
            # (rank numbering still advances past it — ranks may have a gap).
            if result_doc_id == doc_id:
                continue

            hits.append(
                SearchHit(
                    id=result_doc_id,
                    score=score,
                    rank=rank,
                    highlights={},  # MLT doesn't generate highlights
                ),
            )

        return SearchResults(
            hits=hits,
            total=max(0, total - 1),  # Subtract 1 for the original document
            query=f"more_like:{doc_id}",
        )

    def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
        """Get a batch context manager for bulk operations."""
        self._ensure_open()
        return WriteBatch(self, lock_timeout)

    def rebuild(self, documents: QuerySet, iter_wrapper: Callable = _identity) -> None:
        """Rebuild the entire search index.

        Builds a fresh index (wiping the on-disk one first, if any), indexes
        every document in ``documents``, then atomically swaps it in.
        ``iter_wrapper`` lets callers wrap the iteration (e.g. progress bars).
        """
        from documents.search._tokenizer import register_tokenizers

        # Create new index (on-disk or in-memory)
        if self._path is not None:
            wipe_index(self._path)
            new_index = tantivy.Index(build_schema(), path=str(self._path))
            _write_sentinels(self._path)
        else:
            new_index = tantivy.Index(build_schema())
        register_tokenizers(new_index, settings.SEARCH_LANGUAGE)

        # Index all documents using the new index
        writer = new_index.writer()

        for document in iter_wrapper(documents):
            # Temporarily use new index for document building
            # (so _build_tantivy_doc sees the new schema), restoring the old
            # one even if building a document raises.
            old_index = self._index
            old_schema = self._schema
            self._index = new_index
            self._schema = new_index.schema

            try:
                doc = self._build_tantivy_doc(document)
                writer.add_document(doc)
            finally:
                # Restore old index
                self._index = old_index
                self._schema = old_schema

        writer.commit()

        # Swap to new index
        self._index = new_index
        self._schema = new_index.schema
        self._index.reload()
|
||||
|
||||
|
||||
# Module-level singleton with proper thread safety.
# Guarded by _backend_lock on the initialization slow path; see get_backend().
_backend: TantivyBackend | None = None
_backend_path: Path | None = None  # tracks which INDEX_DIR the singleton uses
_backend_lock = threading.RLock()
|
||||
|
||||
|
||||
def get_backend() -> TantivyBackend:
    """Get the global backend instance with thread safety.

    Automatically reinitializes when settings.INDEX_DIR changes — this fixes
    the xdist/override_settings isolation issue where each test may set a
    different INDEX_DIR but would otherwise share a stale singleton.
    """
    global _backend, _backend_path

    current_path: Path = settings.INDEX_DIR

    # Fast path: backend is initialized and path hasn't changed (no lock needed)
    # NOTE(review): lock-free read is a benign race at worst — a concurrent
    # thread just falls through to the locked slow path; confirm intended.
    if _backend is not None and _backend_path == current_path:
        return _backend

    # Slow path: first call, or INDEX_DIR changed between calls
    with _backend_lock:
        # Double-check after acquiring lock — another thread may have beaten us
        if _backend is not None and _backend_path == current_path:
            return _backend

        # Replace a stale backend pointing at the old path.
        if _backend is not None:
            _backend.close()

        _backend = TantivyBackend(path=current_path)
        _backend.open()
        _backend_path = current_path

        return _backend
|
||||
|
||||
|
||||
def reset_backend() -> None:
    """Tear down the global backend singleton (thread-safe).

    The next get_backend() call will construct and open a fresh backend.
    """
    global _backend, _backend_path

    with _backend_lock:
        if _backend is None:
            # Nothing to close; just clear the path tracker.
            _backend_path = None
            return
        _backend.close()
        _backend = None
        _backend_path = None
|
||||
417
src/documents/search/_query.py
Normal file
417
src/documents/search/_query.py
Normal file
@@ -0,0 +1,417 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from datetime import UTC
|
||||
from datetime import date
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import tantivy
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from django.conf import settings
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from datetime import tzinfo
|
||||
|
||||
from django.contrib.auth.base_user import AbstractBaseUser
|
||||
|
||||
# Fields stored date-only (no time component); their keyword ranges use plain
# calendar-day boundaries instead of local-time offset arithmetic.
_DATE_ONLY_FIELDS = frozenset({"created"})

# Natural-language keywords accepted after a date field, e.g. added:this_week.
_DATE_KEYWORDS = frozenset(
    {
        "today",
        "yesterday",
        "this_week",
        "last_week",
        "this_month",
        "last_month",
        "this_year",
        "last_year",
    },
)

# Matches "field:keyword" pairs built from the keyword set above.
_FIELD_DATE_RE = re.compile(
    r"(\w+):(" + "|".join(_DATE_KEYWORDS) + r")\b",
)
# Whoosh-style compact timestamp: 14 digits, YYYYMMDDHHMMSS.
_COMPACT_DATE_RE = re.compile(r"\b(\d{14})\b")
# Relative range with optional day/hour/minute offsets: [now-7d TO now].
_RELATIVE_RANGE_RE = re.compile(
    r"\[now([+-]\d+[dhm])?\s+TO\s+now([+-]\d+[dhm])?\]",
    re.IGNORECASE,
)
# Whoosh-style relative date range: e.g. [-1 week to now], [-7 days to now]
_WHOOSH_REL_RANGE_RE = re.compile(
    r"\[-(?P<n>\d+)\s+(?P<unit>second|minute|hour|day|week|month|year)s?\s+to\s+now\]",
    re.IGNORECASE,
)
# Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly
_DATE8_RE = re.compile(r"(?P<field>\w+):(?P<date8>\d{8})\b")
|
||||
|
||||
|
||||
def _fmt(dt: datetime) -> str:
|
||||
"""Format a datetime as an ISO 8601 UTC string for use in Tantivy range queries."""
|
||||
return dt.astimezone(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
def _iso_range(lo: datetime, hi: datetime) -> str:
    """Build a Tantivy ``[lo TO hi]`` range literal with ISO 8601 UTC bounds."""
    return "[" + _fmt(lo) + " TO " + _fmt(hi) + "]"
|
||||
|
||||
|
||||
def _date_only_range(keyword: str, tz: tzinfo) -> str:
    """
    Resolve a natural-date keyword for `created` (a DateField).

    The local calendar date in *tz* selects the day/week/month/year span, but
    the boundaries themselves are UTC midnights — date-only semantics, no
    timezone-offset arithmetic.

    Raises ValueError for an unrecognised keyword.
    """
    today = datetime.now(tz).date()

    def _midnight_utc(d: date) -> datetime:
        # Date-only semantics: boundary is midnight UTC, not local midnight.
        return datetime(d.year, d.month, d.day, tzinfo=UTC)

    if keyword == "today":
        return _iso_range(_midnight_utc(today), _midnight_utc(today + timedelta(days=1)))
    if keyword == "yesterday":
        return _iso_range(_midnight_utc(today - timedelta(days=1)), _midnight_utc(today))
    if keyword == "this_week":
        monday = today - timedelta(days=today.weekday())
        return _iso_range(_midnight_utc(monday), _midnight_utc(monday + timedelta(weeks=1)))
    if keyword == "last_week":
        monday = today - timedelta(days=today.weekday())
        return _iso_range(_midnight_utc(monday - timedelta(weeks=1)), _midnight_utc(monday))
    if keyword == "this_month":
        first = today.replace(day=1)
        if today.month == 12:
            next_first = date(today.year + 1, 1, 1)
        else:
            next_first = date(today.year, today.month + 1, 1)
        return _iso_range(_midnight_utc(first), _midnight_utc(next_first))
    if keyword == "last_month":
        this_first = today.replace(day=1)
        if today.month == 1:
            prev_first = date(today.year - 1, 12, 1)
        else:
            prev_first = date(today.year, today.month - 1, 1)
        return _iso_range(_midnight_utc(prev_first), _midnight_utc(this_first))
    if keyword == "this_year":
        return _iso_range(
            _midnight_utc(date(today.year, 1, 1)),
            _midnight_utc(date(today.year + 1, 1, 1)),
        )
    if keyword == "last_year":
        return _iso_range(
            _midnight_utc(date(today.year - 1, 1, 1)),
            _midnight_utc(date(today.year, 1, 1)),
        )
    raise ValueError(f"Unknown keyword: {keyword}")
|
||||
|
||||
|
||||
def _datetime_range(keyword: str, tz: tzinfo) -> str:
    """
    Resolve a natural-date keyword for `added` / `modified` (DateTimeFields
    stored as UTC): local-midnight boundaries in *tz* converted to UTC —
    full offset arithmetic, unlike `_date_only_range`.

    Raises ValueError for an unrecognised keyword.
    """
    today = datetime.now(tz).date()

    def _local_midnight(d: date) -> datetime:
        # Local midnight in *tz*, expressed in UTC.
        return datetime(d.year, d.month, d.day, tzinfo=tz).astimezone(UTC)

    if keyword == "today":
        return _iso_range(_local_midnight(today), _local_midnight(today + timedelta(days=1)))
    if keyword == "yesterday":
        return _iso_range(_local_midnight(today - timedelta(days=1)), _local_midnight(today))
    if keyword == "this_week":
        monday = today - timedelta(days=today.weekday())
        return _iso_range(_local_midnight(monday), _local_midnight(monday + timedelta(weeks=1)))
    if keyword == "last_week":
        monday = today - timedelta(days=today.weekday())
        return _iso_range(_local_midnight(monday - timedelta(weeks=1)), _local_midnight(monday))
    if keyword == "this_month":
        first = today.replace(day=1)
        if today.month == 12:
            next_first = date(today.year + 1, 1, 1)
        else:
            next_first = date(today.year, today.month + 1, 1)
        return _iso_range(_local_midnight(first), _local_midnight(next_first))
    if keyword == "last_month":
        this_first = today.replace(day=1)
        if today.month == 1:
            last_first = date(today.year - 1, 12, 1)
        else:
            last_first = date(today.year, today.month - 1, 1)
        return _iso_range(_local_midnight(last_first), _local_midnight(this_first))
    if keyword == "this_year":
        return _iso_range(
            _local_midnight(date(today.year, 1, 1)),
            _local_midnight(date(today.year + 1, 1, 1)),
        )
    if keyword == "last_year":
        return _iso_range(
            _local_midnight(date(today.year - 1, 1, 1)),
            _local_midnight(date(today.year, 1, 1)),
        )
    raise ValueError(f"Unknown keyword: {keyword}")
|
||||
|
||||
|
||||
def _rewrite_compact_date(query: str) -> str:
    """Rewrite Whoosh 14-digit compact timestamps (YYYYMMDDHHmmss) to ISO 8601 UTC."""

    def _convert(match: re.Match[str]) -> str:
        token = match.group(1)
        try:
            parsed = datetime.strptime(token, "%Y%m%d%H%M%S").replace(tzinfo=UTC)
        except ValueError:
            # Fourteen digits but not a valid timestamp — leave untouched.
            return str(match.group(0))
        return parsed.strftime("%Y-%m-%dT%H:%M:%SZ")

    return _COMPACT_DATE_RE.sub(_convert, query)
|
||||
|
||||
|
||||
def _rewrite_relative_range(query: str) -> str:
    """Rewrite ``[now-7d TO now]`` style relative ranges to concrete ISO 8601 UTC bounds."""
    unit_names = {"d": "days", "h": "hours", "m": "minutes"}

    def _convert(match: re.Match[str]) -> str:
        now = datetime.now(UTC)

        def _shift(token: str | None) -> datetime:
            if not token:
                return now
            magnitude = int(token[1:-1])
            delta = timedelta(**{unit_names[token[-1]]: magnitude})
            # Regex guarantees the token starts with '+' or '-'.
            return now + delta if token[0] == "+" else now - delta

        # Sort so the lower bound always comes first, even for inverted input.
        lo, hi = sorted((_shift(match.group(1)), _shift(match.group(2))))
        return f"[{_fmt(lo)} TO {_fmt(hi)}]"

    return _RELATIVE_RANGE_RE.sub(_convert, query)
|
||||
|
||||
|
||||
def _rewrite_whoosh_relative_range(query: str) -> str:
    """Rewrite Whoosh-style relative date ranges (``[-N unit to now]``) to ISO 8601.

    Supports: second, minute, hour, day, week, month, year (singular and plural).
    Example: ``added:[-1 week to now]`` → ``added:[2025-01-01T… TO 2025-01-08T…]``
    """
    now = datetime.now(UTC)

    def _convert(match: re.Match[str]) -> str:
        count = int(match.group("n"))
        unit = match.group("unit").lower()
        if unit == "month":
            start = now - relativedelta(months=count)
        elif unit == "year":
            start = now - relativedelta(years=count)
        else:
            # seconds/minutes/hours/days/weeks map directly onto timedelta kwargs.
            start = now - timedelta(**{unit + "s": count})
        return f"[{_fmt(start)} TO {_fmt(now)}]"

    return _WHOOSH_REL_RANGE_RE.sub(_convert, query)
|
||||
|
||||
|
||||
def _rewrite_8digit_date(query: str, tz: tzinfo) -> str:
    """Rewrite ``field:YYYYMMDD`` date tokens to an ISO 8601 day range.

    Runs after ``_rewrite_compact_date`` so 14-digit timestamps are already
    converted and won't spuriously match here.

    For DateField fields (e.g. ``created``) uses UTC midnight boundaries.
    For DateTimeField fields (e.g. ``added``, ``modified``) uses local-timezone
    midnight boundaries converted to UTC — matching the ``_datetime_range``
    behaviour for keyword dates.
    """

    def _sub(m: re.Match[str]) -> str:
        field = m.group("field")
        raw = m.group("date8")
        try:
            d = date(int(raw[0:4]), int(raw[4:6]), int(raw[6:8]))
        except ValueError:
            # Not a real calendar date (e.g. month 13) — leave the token alone.
            return m.group(0)
        # Hoisted: the next-day date was previously computed three times.
        nxt = d + timedelta(days=1)
        if field in _DATE_ONLY_FIELDS:
            # DateField: plain UTC midnights (UTC has no DST, so this equals
            # the old lo + 1 day form exactly).
            lo = datetime(d.year, d.month, d.day, tzinfo=UTC)
            hi = datetime(nxt.year, nxt.month, nxt.day, tzinfo=UTC)
        else:
            # DateTimeField: local-timezone midnight → UTC
            lo = datetime(d.year, d.month, d.day, tzinfo=tz).astimezone(UTC)
            hi = datetime(nxt.year, nxt.month, nxt.day, tzinfo=tz).astimezone(UTC)
        return f"{field}:[{_fmt(lo)} TO {_fmt(hi)}]"

    return _DATE8_RE.sub(_sub, query)
|
||||
|
||||
|
||||
def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
    """
    Preprocessing stage 1: rewrite Whoosh compact dates, relative ranges, and
    natural date keywords (``field:today`` etc.) to ISO 8601.
    Bare keywords without a ``field:`` prefix pass through unchanged.
    """
    # Rewrite order matters: compact 14-digit timestamps must be handled
    # before the 8-digit day form can safely match.
    query = _rewrite_compact_date(query)
    query = _rewrite_whoosh_relative_range(query)
    query = _rewrite_8digit_date(query, tz)
    query = _rewrite_relative_range(query)

    def _replace(m: re.Match[str]) -> str:
        field, keyword = m.group(1), m.group(2)
        resolver = _date_only_range if field in _DATE_ONLY_FIELDS else _datetime_range
        return f"{field}:{resolver(keyword, tz)}"

    return _FIELD_DATE_RE.sub(_replace, query)
|
||||
|
||||
|
||||
def normalize_query(query: str) -> str:
    """
    Join comma-separated field values with AND and collapse repeated spaces.
    ``tag:foo,bar`` → ``tag:foo AND tag:bar``
    """

    def _expand(match: re.Match[str]) -> str:
        field = match.group(1)
        stripped = (value.strip() for value in match.group(2).split(","))
        return " AND ".join(f"{field}:{value}" for value in stripped if value)

    expanded = re.sub(r"(\w+):([^\s\[\]]+(?:,[^\s\[\]]+)+)", _expand, query)
    return re.sub(r" {2,}", " ", expanded).strip()
|
||||
|
||||
|
||||
_MAX_U64 = 2**64 - 1 # u64 max — used as inclusive upper bound for "any owner" range
|
||||
|
||||
|
||||
def build_permission_filter(
    schema: tantivy.Schema,
    user: AbstractBaseUser,
) -> tantivy.Query:
    """
    Build a Query matching every document visible to *user*:
    - no owner (public) → owner_id field absent (NULL in Django)
    - owned by user → owner_id = user.pk
    - shared with user → viewer_id = user.pk

    Uses disjunction_max_query — boolean Should-only would match all docs.

    NOTE: all integer lookups use range_query, not term_query, to avoid the
    unsigned type-detection bug in tantivy-py 0.25 (lib.rs#L190 infers i64
    before u64; confirmed empirically — term_query returns 0 for u64 fields).
    Same root cause as issue #47 (from_dict) but the term_query path unfixed.
    See: https://github.com/quickwit-oss/tantivy-py/blob/f51d851e857385ad2907241fbce8cf08309c3078/src/lib.rs#L190
         https://github.com/quickwit-oss/tantivy-py/issues/47

    NOTE: no_owner uses boolean_query([Must(all), MustNot(range)]) because
    exists_query is not available in 0.25.1. It is present in master and can
    simplify this to MustNot(exists_query("owner_id")) once released.
    See: https://github.com/quickwit-oss/tantivy-py/blob/master/tantivy/tantivy.pyi
    """

    def _uint_range(field: str, lo: int, hi: int) -> tantivy.Query:
        # Inclusive unsigned range lookup (see term_query note above).
        return tantivy.Query.range_query(
            schema,
            field,
            tantivy.FieldType.Unsigned,
            lo,
            hi,
        )

    has_any_owner = _uint_range("owner_id", 1, _MAX_U64)
    public = tantivy.Query.boolean_query(
        [
            (tantivy.Occur.Must, tantivy.Query.all_query()),
            (tantivy.Occur.MustNot, has_any_owner),
        ],
    )
    owned_by_user = _uint_range("owner_id", user.pk, user.pk)
    shared_with_user = _uint_range("viewer_id", user.pk, user.pk)
    return tantivy.Query.disjunction_max_query(
        [public, owned_by_user, shared_with_user],
    )
|
||||
|
||||
|
||||
DEFAULT_SEARCH_FIELDS = [
|
||||
"title",
|
||||
"content",
|
||||
"correspondent",
|
||||
"document_type",
|
||||
"tag",
|
||||
"note", # companion text field for notes content (notes JSON for structured: notes.user:x)
|
||||
"custom_field", # companion text field for CF values (custom_fields JSON for structured: custom_fields.name:x)
|
||||
]
|
||||
_FIELD_BOOSTS = {"title": 2.0}
|
||||
|
||||
|
||||
def parse_user_query(
    index: tantivy.Index,
    raw_query: str,
    tz: tzinfo,
) -> tantivy.Query:
    """Run the full query preprocessing pipeline: date rewriting → normalisation → Tantivy parse.

    When ADVANCED_FUZZY_SEARCH_THRESHOLD is set (any float), a fuzzy query is
    blended in as a Should clause boosted at 0.1 — keeping fuzzy hits ranked
    below exact matches. The fuzzy query uses edit-distance=1, prefix=True,
    transposition_cost_one=True on all search fields. The threshold float is a
    post-search minimum-score filter applied in the backend layer, not here.
    """
    processed = normalize_query(rewrite_natural_date_keywords(raw_query, tz))

    exact = index.parse_query(
        processed,
        DEFAULT_SEARCH_FIELDS,
        field_boosts=_FIELD_BOOSTS,
    )

    threshold = getattr(settings, "ADVANCED_FUZZY_SEARCH_THRESHOLD", None)
    if threshold is None:
        # Fuzzy blending disabled — exact parse only.
        return exact

    fuzzy = index.parse_query(
        processed,
        DEFAULT_SEARCH_FIELDS,
        field_boosts=_FIELD_BOOSTS,
        # (prefix=True, distance=1, transposition_cost_one=True) — edit-distance fuzziness
        fuzzy_fields=dict.fromkeys(DEFAULT_SEARCH_FIELDS, (True, 1, True)),
    )
    return tantivy.Query.boolean_query(
        [
            (tantivy.Occur.Should, exact),
            # 0.1 boost keeps fuzzy hits ranked below exact matches (intentional)
            (tantivy.Occur.Should, tantivy.Query.boost_query(fuzzy, 0.1)),
        ],
    )
|
||||
132
src/documents/search/_schema.py
Normal file
132
src/documents/search/_schema.py
Normal file
@@ -0,0 +1,132 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import tantivy
|
||||
from django.conf import settings
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger("paperless.search")
|
||||
|
||||
SCHEMA_VERSION = 1
|
||||
|
||||
|
||||
def build_schema() -> tantivy.Schema:
    """Build the Tantivy schema for the paperless document index.

    Field-addition order is preserved deliberately — it determines field IDs.
    """
    builder = tantivy.SchemaBuilder()

    builder.add_unsigned_field("id", stored=True, indexed=True, fast=True)
    builder.add_text_field("checksum", stored=True, tokenizer_name="raw")

    # Primary full-text fields.
    for name in (
        "title",
        "correspondent",
        "document_type",
        "storage_path",
        "original_filename",
        "content",
    ):
        builder.add_text_field(name, stored=True, tokenizer_name="paperless_text")

    # Shadow sort fields - fast, not stored/indexed
    for name in ("title_sort", "correspondent_sort", "type_sort"):
        builder.add_text_field(
            name,
            stored=False,
            tokenizer_name="simple_analyzer",
            fast=True,
        )

    # CJK support - not stored, indexed only
    builder.add_text_field(
        "bigram_content",
        stored=False,
        tokenizer_name="bigram_analyzer",
    )

    # Autocomplete prefix scan - stored, not indexed
    builder.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw")

    builder.add_text_field("tag", stored=True, tokenizer_name="paperless_text")

    # JSON fields — structured queries: notes.user:alice, custom_fields.name:invoice
    # tantivy-py 0.25 does not support dotted paths in parse_query default_field_names,
    # so companion text fields (note, custom_field) carry content for default full-text search.
    builder.add_json_field("notes", stored=True, tokenizer_name="paperless_text")
    builder.add_json_field("custom_fields", stored=True, tokenizer_name="paperless_text")

    # Companion text fields for default full-text search (not stored — no extra disk cost)
    builder.add_text_field("note", stored=False, tokenizer_name="paperless_text")
    builder.add_text_field("custom_field", stored=False, tokenizer_name="paperless_text")

    # Relation IDs for filtering and the permission filter.
    for name in (
        "correspondent_id",
        "document_type_id",
        "storage_path_id",
        "tag_id",
        "owner_id",
        "viewer_id",
    ):
        builder.add_unsigned_field(name, stored=False, indexed=True, fast=True)

    for name in ("created", "modified", "added"):
        builder.add_date_field(name, stored=True, indexed=True, fast=True)

    for name in ("asn", "page_count", "num_notes"):
        builder.add_unsigned_field(name, stored=True, indexed=True, fast=True)

    return builder.build()
|
||||
|
||||
|
||||
def _needs_rebuild(index_dir: Path) -> bool:
    """Decide whether the search index must be rebuilt.

    Compares on-disk schema-version and language sentinel files against
    SCHEMA_VERSION and settings.SEARCH_LANGUAGE; any missing, unreadable
    or mismatching sentinel forces a rebuild.
    """
    version_file = index_dir / ".schema_version"
    if not version_file.exists():
        return True
    try:
        stored_version = int(version_file.read_text().strip())
    except ValueError:
        # Corrupt sentinel — rebuild from scratch.
        return True
    if stored_version != SCHEMA_VERSION:
        logger.info("Search index schema version mismatch - rebuilding.")
        return True

    language_file = index_dir / ".schema_language"
    if not language_file.exists():
        logger.info("Search index language sentinel missing - rebuilding.")
        return True
    if language_file.read_text().strip() != (settings.SEARCH_LANGUAGE or ""):
        logger.info("Search index language changed - rebuilding.")
        return True

    return False
|
||||
|
||||
|
||||
def wipe_index(index_dir: Path) -> None:
    """Delete every entry inside *index_dir* (the directory itself is kept)."""
    # Materialise the listing first so deletion doesn't race the iterator.
    for entry in list(index_dir.iterdir()):
        if entry.is_dir():
            shutil.rmtree(entry)
        else:
            entry.unlink()
|
||||
|
||||
|
||||
def _write_sentinels(index_dir: Path) -> None:
    """Persist schema-version and language sentinels so the next open can skip a rebuild."""
    version_path = index_dir / ".schema_version"
    language_path = index_dir / ".schema_language"
    version_path.write_text(str(SCHEMA_VERSION))
    language_path.write_text(settings.SEARCH_LANGUAGE or "")
|
||||
|
||||
|
||||
def open_or_rebuild_index(index_dir: Path | None = None) -> tantivy.Index:
    """
    Open the Tantivy index at *index_dir* (defaults to settings.INDEX_DIR),
    creating or rebuilding it when the sentinels indicate a schema or
    language change.
    Caller must register custom tokenizers after receiving the Index.
    """
    target = settings.INDEX_DIR if index_dir is None else index_dir
    if not _needs_rebuild(target):
        return tantivy.Index.open(str(target))
    wipe_index(target)
    rebuilt = tantivy.Index(build_schema(), path=str(target))
    _write_sentinels(target)
    return rebuilt
|
||||
108
src/documents/search/_tokenizer.py
Normal file
108
src/documents/search/_tokenizer.py
Normal file
@@ -0,0 +1,108 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
import tantivy
|
||||
|
||||
logger = logging.getLogger("paperless.search")
|
||||
|
||||
# Mapping of ISO 639-1 codes (and common aliases) -> Tantivy Snowball name
|
||||
_LANGUAGE_MAP: dict[str, str] = {
|
||||
"ar": "Arabic",
|
||||
"arabic": "Arabic",
|
||||
"da": "Danish",
|
||||
"danish": "Danish",
|
||||
"nl": "Dutch",
|
||||
"dutch": "Dutch",
|
||||
"en": "English",
|
||||
"english": "English",
|
||||
"fi": "Finnish",
|
||||
"finnish": "Finnish",
|
||||
"fr": "French",
|
||||
"french": "French",
|
||||
"de": "German",
|
||||
"german": "German",
|
||||
"el": "Greek",
|
||||
"greek": "Greek",
|
||||
"hu": "Hungarian",
|
||||
"hungarian": "Hungarian",
|
||||
"it": "Italian",
|
||||
"italian": "Italian",
|
||||
"no": "Norwegian",
|
||||
"norwegian": "Norwegian",
|
||||
"pt": "Portuguese",
|
||||
"portuguese": "Portuguese",
|
||||
"ro": "Romanian",
|
||||
"romanian": "Romanian",
|
||||
"ru": "Russian",
|
||||
"russian": "Russian",
|
||||
"es": "Spanish",
|
||||
"spanish": "Spanish",
|
||||
"sv": "Swedish",
|
||||
"swedish": "Swedish",
|
||||
"ta": "Tamil",
|
||||
"tamil": "Tamil",
|
||||
"tr": "Turkish",
|
||||
"turkish": "Turkish",
|
||||
}
|
||||
|
||||
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
|
||||
|
||||
|
||||
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
    """
    Register all custom tokenizers on *index*. Must be called on every Index
    instance — tantivy requires re-registration at each open.

    simple_analyzer is also registered as a fast-field tokenizer because the
    sort shadow fields (title_sort, correspondent_sort, type_sort) use fast=True.
    Tantivy writes default values for fast columns on every commit, even for
    documents that omit those fields, so the fast-field tokenizer must exist.
    """
    analyzers = {
        "paperless_text": _paperless_text(language),
        "simple_analyzer": _simple_analyzer(),
        "bigram_analyzer": _bigram_analyzer(),
    }
    for name, analyzer in analyzers.items():
        index.register_tokenizer(name, analyzer)
    # Fast-field tokenizer required for fast=True text fields in the schema
    index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())
|
||||
|
||||
|
||||
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
    """Main full-text tokenizer for content, title, etc:
    simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer].
    """
    builder = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
    for token_filter in (
        tantivy.Filter.remove_long(65),
        tantivy.Filter.lowercase(),
        tantivy.Filter.ascii_fold(),
    ):
        builder = builder.filter(token_filter)
    if language:
        stemmer_name = _LANGUAGE_MAP.get(language.lower())
        if stemmer_name is None:
            # Unknown language: index without stemming rather than failing.
            logger.warning(
                "Unsupported search language '%s' - stemming disabled. Supported: %s",
                language,
                ", ".join(sorted(SUPPORTED_LANGUAGES)),
            )
        else:
            builder = builder.filter(tantivy.Filter.stemmer(stemmer_name))
    return builder.build()
|
||||
|
||||
|
||||
def _simple_analyzer() -> tantivy.TextAnalyzer:
    """Tokenizer for shadow sort fields (title_sort, correspondent_sort, type_sort): simple -> lowercase -> ascii_fold."""
    builder = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
    builder = builder.filter(tantivy.Filter.lowercase())
    builder = builder.filter(tantivy.Filter.ascii_fold())
    return builder.build()
|
||||
|
||||
|
||||
def _bigram_analyzer() -> tantivy.TextAnalyzer:
    """Enables substring search in CJK text: ngram(2,2) -> lowercase. CJK / no-whitespace language support."""
    ngram_tokenizer = tantivy.Tokenizer.ngram(min_gram=2, max_gram=2, prefix_only=False)
    builder = tantivy.TextAnalyzerBuilder(ngram_tokenizer)
    return builder.filter(tantivy.Filter.lowercase()).build()
|
||||
@@ -1293,22 +1293,18 @@ class SearchResultSerializer(DocumentSerializer):
|
||||
documents = self.context.get("documents")
|
||||
# Otherwise we fetch this document.
|
||||
if documents is None: # pragma: no cover
|
||||
# In practice we only serialize **lists** of whoosh.searching.Hit.
|
||||
# I'm keeping this check for completeness but marking it no cover for now.
|
||||
# In practice we only serialize **lists** of SearchHit dicts.
|
||||
# Keeping this check for completeness but marking it no cover for now.
|
||||
documents = self.fetch_documents([hit["id"]])
|
||||
document = documents[hit["id"]]
|
||||
|
||||
notes = ",".join(
|
||||
[str(c.note) for c in document.notes.all()],
|
||||
)
|
||||
highlights = hit.get("highlights", {})
|
||||
r = super().to_representation(document)
|
||||
r["__search_hit__"] = {
|
||||
"score": hit.score,
|
||||
"highlights": hit.highlights("content", text=document.content),
|
||||
"note_highlights": (
|
||||
hit.highlights("notes", text=notes) if document else None
|
||||
),
|
||||
"rank": hit.rank,
|
||||
"score": hit["score"],
|
||||
"highlights": highlights.get("content", ""),
|
||||
"note_highlights": highlights.get("notes") or None,
|
||||
"rank": hit["rank"],
|
||||
}
|
||||
|
||||
return r
|
||||
|
||||
@@ -790,12 +790,12 @@ def cleanup_user_deletion(sender, instance: User | Group, **kwargs) -> None:
|
||||
|
||||
|
||||
def add_to_index(sender, document, **kwargs) -> None:
|
||||
from documents import index
|
||||
from documents.search import get_backend
|
||||
|
||||
index.add_or_update_document(document)
|
||||
get_backend().add_or_update(document)
|
||||
if document.root_document_id is not None and document.root_document is not None:
|
||||
# keep in sync when a new version is consumed.
|
||||
index.add_or_update_document(
|
||||
get_backend().add_or_update(
|
||||
document.root_document,
|
||||
effective_content=document.content,
|
||||
)
|
||||
|
||||
@@ -20,9 +20,7 @@ from django.db import transaction
|
||||
from django.db.models.signals import post_save
|
||||
from django.utils import timezone
|
||||
from filelock import FileLock
|
||||
from whoosh.writing import AsyncWriter
|
||||
|
||||
from documents import index
|
||||
from documents import sanity_checker
|
||||
from documents.barcodes import BarcodePlugin
|
||||
from documents.bulk_download import ArchiveOnlyStrategy
|
||||
@@ -84,19 +82,24 @@ def _identity(iterable: Iterable[_T]) -> Iterable[_T]:
|
||||
|
||||
@shared_task
|
||||
def index_optimize() -> None:
|
||||
ix = index.open_index()
|
||||
writer = AsyncWriter(ix)
|
||||
writer.commit(optimize=True)
|
||||
logger.info(
|
||||
"document_index optimize is deprecated — Tantivy manages "
|
||||
"segment merging automatically.",
|
||||
)
|
||||
|
||||
|
||||
def index_reindex(*, iter_wrapper: IterWrapper[Document] = _identity) -> None:
|
||||
documents = Document.objects.all()
|
||||
from documents.search import get_backend
|
||||
from documents.search import reset_backend
|
||||
|
||||
ix = index.open_index(recreate=True)
|
||||
|
||||
with AsyncWriter(ix) as writer:
|
||||
for document in iter_wrapper(documents):
|
||||
index.update_document(writer, document)
|
||||
documents = Document.objects.select_related(
|
||||
"correspondent",
|
||||
"document_type",
|
||||
"storage_path",
|
||||
"owner",
|
||||
).prefetch_related("tags", "notes", "custom_fields")
|
||||
get_backend().rebuild(documents, iter_wrapper=iter_wrapper)
|
||||
reset_backend()
|
||||
|
||||
|
||||
@shared_task
|
||||
@@ -270,9 +273,9 @@ def sanity_check(*, scheduled=True, raise_on_error=True):
|
||||
|
||||
@shared_task
|
||||
def bulk_update_documents(document_ids) -> None:
|
||||
documents = Document.objects.filter(id__in=document_ids)
|
||||
from documents.search import get_backend
|
||||
|
||||
ix = index.open_index()
|
||||
documents = Document.objects.filter(id__in=document_ids)
|
||||
|
||||
for doc in documents:
|
||||
clear_document_caches(doc.pk)
|
||||
@@ -283,9 +286,9 @@ def bulk_update_documents(document_ids) -> None:
|
||||
)
|
||||
post_save.send(Document, instance=doc, created=False)
|
||||
|
||||
with AsyncWriter(ix) as writer:
|
||||
with get_backend().batch_update() as batch:
|
||||
for doc in documents:
|
||||
index.update_document(writer, doc)
|
||||
batch.add_or_update(doc)
|
||||
|
||||
ai_config = AIConfig()
|
||||
if ai_config.llm_index_enabled:
|
||||
@@ -389,8 +392,9 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
logger.info(
|
||||
f"Updating index for document {document_id} ({document.archive_checksum})",
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, document)
|
||||
from documents.search import get_backend
|
||||
|
||||
get_backend().add_or_update(document)
|
||||
|
||||
ai_config = AIConfig()
|
||||
if ai_config.llm_index_enabled:
|
||||
|
||||
0
src/documents/tests/search/__init__.py
Normal file
0
src/documents/tests/search/__init__.py
Normal file
33
src/documents/tests/search/conftest.py
Normal file
33
src/documents/tests/search/conftest.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
from documents.search._backend import TantivyBackend
|
||||
from documents.search._backend import reset_backend
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Generator
|
||||
from pathlib import Path
|
||||
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
|
||||
|
||||
@pytest.fixture
def index_dir(tmp_path: Path, settings: SettingsWrapper) -> Path:
    """Create a temporary index directory and point settings.INDEX_DIR at it."""
    directory = tmp_path / "index"
    directory.mkdir()
    settings.INDEX_DIR = directory
    return directory
|
||||
|
||||
|
||||
@pytest.fixture
def backend() -> Generator[TantivyBackend, None, None]:
    """Yield an open in-memory TantivyBackend; close it and reset the singleton afterwards."""
    instance = TantivyBackend()  # path=None → in-memory index
    instance.open()
    try:
        yield instance
    finally:
        instance.close()
        reset_backend()
|
||||
317
src/documents/tests/search/test_backend.py
Normal file
317
src/documents/tests/search/test_backend.py
Normal file
@@ -0,0 +1,317 @@
|
||||
import pytest
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
from documents.models import CustomField
|
||||
from documents.models import CustomFieldInstance
|
||||
from documents.models import Document
|
||||
from documents.models import Note
|
||||
from documents.search._backend import TantivyBackend
|
||||
from documents.search._backend import get_backend
|
||||
from documents.search._backend import reset_backend
|
||||
|
||||
pytestmark = [pytest.mark.search, pytest.mark.django_db]
|
||||
|
||||
|
||||
class TestWriteBatch:
    """Test WriteBatch context manager functionality."""

    def test_rolls_back_on_exception(self, backend: TantivyBackend):
        """Data integrity: a mid-batch exception must not corrupt the index."""
        doc = Document.objects.create(
            title="Rollback Target",
            content="should survive",
            checksum="RB1",
            pk=1,
        )
        backend.add_or_update(doc)

        # pytest.raises instead of try/except-pass: the old form silently
        # passed even if the context manager swallowed the error.
        with pytest.raises(RuntimeError, match="simulated failure"):
            with backend.batch_update() as batch:
                batch.remove(doc.pk)
                raise RuntimeError("simulated failure")

        result = backend.search(
            "should survive",
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
        )
        assert result.total == 1
|
||||
|
||||
|
||||
class TestSearch:
    """Test search functionality."""

    @staticmethod
    def _search(backend, query, user):
        # Shared first-page search call used by every test in this class.
        return backend.search(
            query,
            user=user,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
        )

    def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend):
        """UI score bar depends on the top hit being 1.0."""
        for offset, title in enumerate(
            ["bank invoice", "bank statement", "bank receipt"],
        ):
            backend.add_or_update(
                Document.objects.create(
                    title=title,
                    content=title,
                    checksum=f"SN{offset}",
                    pk=10 + offset,
                ),
            )
        result = self._search(backend, "bank", None)
        assert result.hits[0]["score"] == pytest.approx(1.0)
        assert all(0.0 <= hit["score"] <= 1.0 for hit in result.hits)

    def test_owner_filter(self, backend: TantivyBackend):
        """Owner can find their document; other user cannot."""
        owner = User.objects.create_user("owner")
        other = User.objects.create_user("other")
        backend.add_or_update(
            Document.objects.create(
                title="Private",
                content="secret",
                checksum="PF1",
                pk=20,
                owner=owner,
            ),
        )
        assert self._search(backend, "secret", owner).total == 1
        assert self._search(backend, "secret", other).total == 0
|
||||
|
||||
|
||||
class TestRebuild:
    """Test index rebuilding functionality."""

    def test_with_iter_wrapper_called(self, backend: TantivyBackend):
        """rebuild() must pass documents through iter_wrapper."""
        observed_pks = []

        def tracking_wrapper(docs):
            # Record each pk as rebuild() consumes the iterator.
            for doc in docs:
                observed_pks.append(doc.pk)
                yield doc

        Document.objects.create(title="Tracked", content="x", checksum="TW1", pk=30)
        backend.rebuild(Document.objects.all(), iter_wrapper=tracking_wrapper)
        assert 30 in observed_pks
|
||||
|
||||
|
||||
class TestAutocomplete:
    """Test autocomplete functionality."""

    def test_basic_functionality(self, backend: TantivyBackend):
        """Autocomplete should find word prefixes."""
        indexed = Document.objects.create(
            title="Invoice from Microsoft Corporation",
            content="payment details",
            checksum="AC1",
            pk=40,
        )
        backend.add_or_update(indexed)

        suggestions = backend.autocomplete("micro", limit=10)
        assert "microsoft" in suggestions

    def test_results_ordered_by_document_frequency(self, backend: TantivyBackend):
        """Most-used prefix match should rank first."""
        # "payment" appears in 3 docs; "payslip" in 1 — "pay" prefix should
        # return "payment" before "payslip".
        fixtures = [
            ("payment invoice", "AF1"),
            ("payment receipt", "AF2"),
            ("payment confirmation", "AF3"),
            ("payslip march", "AF4"),
        ]
        for doc_pk, (doc_title, doc_checksum) in enumerate(fixtures, start=41):
            backend.add_or_update(
                Document.objects.create(
                    title=doc_title,
                    content="details",
                    checksum=doc_checksum,
                    pk=doc_pk,
                ),
            )

        suggestions = backend.autocomplete("pay", limit=10)
        assert suggestions.index("payment") < suggestions.index("payslip")
||||
|
||||
class TestMoreLikeThis:
    """Test more like this functionality."""

    def test_excludes_original(self, backend: TantivyBackend):
        """More like this should not return the original document."""
        source = Document.objects.create(
            title="Important document",
            content="financial information",
            checksum="MLT1",
            pk=50,
        )
        similar = Document.objects.create(
            title="Another document",
            content="financial report",
            checksum="MLT2",
            pk=51,
        )
        backend.add_or_update(source)
        backend.add_or_update(similar)

        page = backend.more_like_this(doc_id=50, user=None, page=1, page_size=10)
        # The query document itself must never appear among its own neighbours.
        assert 50 not in [hit["id"] for hit in page.hits]
|
||||
class TestSingleton:
    """Test get_backend() and reset_backend() singleton lifecycle."""

    @pytest.fixture(autouse=True)
    def _clean(self):
        # Guarantee a pristine singleton before and after every test.
        reset_backend()
        yield
        reset_backend()

    def test_returns_same_instance_on_repeated_calls(self, index_dir):
        first = get_backend()
        second = get_backend()
        assert first is second

    def test_reinitializes_when_index_dir_changes(self, tmp_path, settings):
        dir_a = tmp_path / "a"
        dir_a.mkdir()
        settings.INDEX_DIR = dir_a
        backend_a = get_backend()

        dir_b = tmp_path / "b"
        dir_b.mkdir()
        settings.INDEX_DIR = dir_b
        backend_b = get_backend()

        assert backend_a is not backend_b
        assert backend_b._path == dir_b

    def test_reset_forces_new_instance(self, index_dir):
        before = get_backend()
        reset_backend()
        assert before is not get_backend()
|
||||
class TestFieldHandling:
    """Test handling of various document fields."""

    @staticmethod
    def _total(backend, query):
        # Every search in this class uses the same paging/sort arguments.
        return backend.search(
            query,
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
        ).total

    def test_none_values_handled_correctly(self, backend: TantivyBackend):
        """Test that None values for original_filename and page_count are handled properly."""
        doc = Document.objects.create(
            title="Test Doc",
            content="test content",
            checksum="NV1",
            pk=60,
            original_filename=None,
            page_count=None,
        )
        # Should not raise an exception
        backend.add_or_update(doc)
        assert self._total(backend, "test") == 1

    def test_custom_fields_include_name_and_value(self, backend: TantivyBackend):
        """Custom field indexing should include both name and value."""
        invoice_number = CustomField.objects.create(
            name="Invoice Number",
            data_type=CustomField.FieldDataType.STRING,
        )
        doc = Document.objects.create(
            title="Invoice",
            content="test",
            checksum="CF1",
            pk=70,
        )
        CustomFieldInstance.objects.create(
            document=doc,
            field=invoice_number,
            value_text="INV-2024-001",
        )

        # Should not raise an exception during indexing
        backend.add_or_update(doc)
        assert self._total(backend, "invoice") == 1

    def test_notes_include_user_information(self, backend: TantivyBackend):
        """Notes should include user information when available."""
        author = User.objects.create_user("notewriter")
        doc = Document.objects.create(
            title="Doc with notes",
            content="test",
            checksum="NT1",
            pk=80,
        )
        Note.objects.create(document=doc, note="Important note", user=author)

        # Should not raise an exception during indexing
        backend.add_or_update(doc)

        # Both the document body and the attached note must be searchable.
        content_total = self._total(backend, "test")
        assert content_total == 1, (
            f"Expected 1, got {content_total}. Document content should be searchable."
        )
        note_total = self._total(backend, "important")
        assert note_total == 1, (
            f"Expected 1, got {note_total}. Note content should be searchable."
        )
||||
382
src/documents/tests/search/test_query.py
Normal file
382
src/documents/tests/search/test_query.py
Normal file
@@ -0,0 +1,382 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from datetime import UTC
|
||||
from datetime import datetime
|
||||
from datetime import tzinfo
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
import pytest
|
||||
import tantivy
|
||||
import time_machine
|
||||
|
||||
from documents.search._query import build_permission_filter
|
||||
from documents.search._query import normalize_query
|
||||
from documents.search._query import parse_user_query
|
||||
from documents.search._query import rewrite_natural_date_keywords
|
||||
from documents.search._schema import build_schema
|
||||
from documents.search._tokenizer import register_tokenizers
|
||||
|
||||
pytestmark = pytest.mark.search
|
||||
|
||||
# UTC is imported directly from datetime; the redundant self-assignment
# (`UTC = UTC`, a leftover from an earlier pytz-style alias) is removed.
# The named zones below exercise offset arithmetic on both sides of UTC,
# including a DST transition.
EASTERN = ZoneInfo("America/New_York")  # UTC-5 / UTC-4 (DST)
AUCKLAND = ZoneInfo("Pacific/Auckland")  # UTC+13 in southern-hemisphere summer
|
||||
|
||||
def _range(result: str, field: str) -> tuple[str, str]:
|
||||
m = re.search(rf"{field}:\[(.+?) TO (.+?)\]", result)
|
||||
assert m, f"No range for {field!r} in: {result!r}"
|
||||
return m.group(1), m.group(2)
|
||||
|
||||
|
||||
class TestCreatedDateField:
    """
    ``created`` is a Django DateField: it is indexed as midnight UTC of the
    local calendar date, so no timezone-offset arithmetic is expected here —
    only the local calendar date matters.
    """

    @pytest.mark.parametrize(
        ("tz", "expected_lo", "expected_hi"),
        [
            pytest.param(UTC, "2026-03-28T00:00:00Z", "2026-03-29T00:00:00Z", id="utc"),
            pytest.param(
                EASTERN,
                "2026-03-28T00:00:00Z",
                "2026-03-29T00:00:00Z",
                id="eastern_same_calendar_date",
            ),
        ],
    )
    @time_machine.travel(datetime(2026, 3, 28, 15, 30, tzinfo=UTC), tick=False)
    def test_today(self, tz: tzinfo, expected_lo: str, expected_hi: str) -> None:
        rewritten = rewrite_natural_date_keywords("created:today", tz)
        assert _range(rewritten, "created") == (expected_lo, expected_hi)

    @time_machine.travel(datetime(2026, 3, 28, 3, 0, tzinfo=UTC), tick=False)
    def test_today_auckland_ahead_of_utc(self) -> None:
        # UTC 03:00 -> Auckland (UTC+13) = 16:00 same date; local date = 2026-03-28
        rewritten = rewrite_natural_date_keywords("created:today", AUCKLAND)
        lower_bound, _ = _range(rewritten, "created")
        assert lower_bound == "2026-03-28T00:00:00Z"

    @pytest.mark.parametrize(
        ("field", "keyword", "expected_lo", "expected_hi"),
        [
            pytest.param(
                "created",
                "yesterday",
                "2026-03-27T00:00:00Z",
                "2026-03-28T00:00:00Z",
                id="yesterday",
            ),
            pytest.param(
                "created",
                "this_week",
                "2026-03-23T00:00:00Z",
                "2026-03-30T00:00:00Z",
                id="this_week_mon_sun",
            ),
            pytest.param(
                "created",
                "last_week",
                "2026-03-16T00:00:00Z",
                "2026-03-23T00:00:00Z",
                id="last_week",
            ),
            pytest.param(
                "created",
                "this_month",
                "2026-03-01T00:00:00Z",
                "2026-04-01T00:00:00Z",
                id="this_month",
            ),
            pytest.param(
                "created",
                "last_month",
                "2026-02-01T00:00:00Z",
                "2026-03-01T00:00:00Z",
                id="last_month",
            ),
            pytest.param(
                "created",
                "this_year",
                "2026-01-01T00:00:00Z",
                "2027-01-01T00:00:00Z",
                id="this_year",
            ),
            pytest.param(
                "created",
                "last_year",
                "2025-01-01T00:00:00Z",
                "2026-01-01T00:00:00Z",
                id="last_year",
            ),
        ],
    )
    @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False)
    def test_date_keywords(
        self,
        field: str,
        keyword: str,
        expected_lo: str,
        expected_hi: str,
    ) -> None:
        # 2026-03-28 is Saturday; Mon-Sun week calculation built into expectations
        rewritten = rewrite_natural_date_keywords(f"{field}:{keyword}", UTC)
        assert _range(rewritten, field) == (expected_lo, expected_hi)
||||
|
||||
class TestDateTimeFields:
    """
    ``added``/``modified`` store full UTC datetimes, so natural keywords must
    convert the local day boundaries to UTC — offset arithmetic IS required.
    """

    @time_machine.travel(datetime(2026, 3, 28, 15, 30, tzinfo=UTC), tick=False)
    def test_added_today_eastern(self) -> None:
        # EDT = UTC-4; local midnight 2026-03-28 00:00 EDT = 2026-03-28 04:00 UTC
        bounds = _range(rewrite_natural_date_keywords("added:today", EASTERN), "added")
        assert bounds == ("2026-03-28T04:00:00Z", "2026-03-29T04:00:00Z")

    @time_machine.travel(datetime(2026, 3, 29, 2, 0, tzinfo=UTC), tick=False)
    def test_added_today_auckland_midnight_crossing(self) -> None:
        # UTC 02:00 on 2026-03-29 -> Auckland (UTC+13) = 2026-03-29 15:00 local
        # Auckland midnight = UTC 2026-03-28 11:00
        bounds = _range(rewrite_natural_date_keywords("added:today", AUCKLAND), "added")
        assert bounds == ("2026-03-28T11:00:00Z", "2026-03-29T11:00:00Z")

    @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False)
    def test_modified_today_utc(self) -> None:
        bounds = _range(
            rewrite_natural_date_keywords("modified:today", UTC),
            "modified",
        )
        assert bounds == ("2026-03-28T00:00:00Z", "2026-03-29T00:00:00Z")
||||
|
||||
class TestWhooshQueryRewriting:
    """All Whoosh query syntax variants must be rewritten to ISO 8601 before Tantivy parses them."""

    @staticmethod
    def _bounds(query: str, field: str, tz=UTC) -> tuple[str, str]:
        # Rewrite the query and pull out the resulting ISO range for ``field``.
        return _range(rewrite_natural_date_keywords(query, tz), field)

    @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False)
    def test_compact_date_shim_rewrites_to_iso(self) -> None:
        rewritten = rewrite_natural_date_keywords("created:20240115120000", UTC)
        assert "2024-01-15" in rewritten
        assert "20240115120000" not in rewritten

    @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False)
    def test_relative_range_shim_removes_now(self) -> None:
        rewritten = rewrite_natural_date_keywords("added:[now-7d TO now]", UTC)
        assert "now" not in rewritten
        assert "2026-03-" in rewritten

    @time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False)
    def test_bracket_minus_7_days(self) -> None:
        assert self._bounds("added:[-7 days to now]", "added") == (
            "2026-03-21T12:00:00Z",
            "2026-03-28T12:00:00Z",
        )

    @time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False)
    def test_bracket_minus_1_week(self) -> None:
        assert self._bounds("added:[-1 week to now]", "added") == (
            "2026-03-21T12:00:00Z",
            "2026-03-28T12:00:00Z",
        )

    @time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False)
    def test_bracket_minus_1_month_uses_relativedelta(self) -> None:
        # relativedelta(months=1) from 2026-03-28 = 2026-02-28 (not 29)
        assert self._bounds("created:[-1 month to now]", "created") == (
            "2026-02-28T12:00:00Z",
            "2026-03-28T12:00:00Z",
        )

    @time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False)
    def test_bracket_minus_1_year(self) -> None:
        assert self._bounds("modified:[-1 year to now]", "modified") == (
            "2025-03-28T12:00:00Z",
            "2026-03-28T12:00:00Z",
        )

    @time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False)
    def test_bracket_plural_unit_hours(self) -> None:
        assert self._bounds("added:[-3 hours to now]", "added") == (
            "2026-03-28T09:00:00Z",
            "2026-03-28T12:00:00Z",
        )

    @time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False)
    def test_bracket_case_insensitive(self) -> None:
        rewritten = rewrite_natural_date_keywords("added:[-1 WEEK TO NOW]", UTC)
        assert "now" not in rewritten.lower()
        assert _range(rewritten, "added") == (
            "2026-03-21T12:00:00Z",
            "2026-03-28T12:00:00Z",
        )

    def test_8digit_created_date_field_always_uses_utc_midnight(self) -> None:
        # created is a DateField: boundaries are always UTC midnight, no TZ offset
        assert self._bounds("created:20231201", "created", EASTERN) == (
            "2023-12-01T00:00:00Z",
            "2023-12-02T00:00:00Z",
        )

    def test_8digit_added_datetime_field_converts_local_midnight_to_utc(self) -> None:
        # added is DateTimeField: midnight Dec 1 Eastern (EST = UTC-5) = 05:00 UTC
        assert self._bounds("added:20231201", "added", EASTERN) == (
            "2023-12-01T05:00:00Z",
            "2023-12-02T05:00:00Z",
        )

    def test_8digit_modified_datetime_field_converts_local_midnight_to_utc(
        self,
    ) -> None:
        assert self._bounds("modified:20231201", "modified", EASTERN) == (
            "2023-12-01T05:00:00Z",
            "2023-12-02T05:00:00Z",
        )

    def test_8digit_invalid_date_passes_through_unchanged(self) -> None:
        assert rewrite_natural_date_keywords("added:20231340", UTC) == "added:20231340"
||||
|
||||
class TestParseUserQuery:
    """parse_user_query runs the full preprocessing pipeline."""

    @pytest.fixture
    def query_index(self) -> tantivy.Index:
        # In-memory index with the project schema and tokenizers registered.
        index = tantivy.Index(build_schema(), path=None)
        register_tokenizers(index, "")
        return index

    def test_returns_tantivy_query(self, query_index: tantivy.Index) -> None:
        parsed = parse_user_query(query_index, "invoice", UTC)
        assert isinstance(parsed, tantivy.Query)

    def test_fuzzy_mode_does_not_raise(
        self,
        query_index: tantivy.Index,
        settings,
    ) -> None:
        settings.ADVANCED_FUZZY_SEARCH_THRESHOLD = 0.5
        parsed = parse_user_query(query_index, "invoice", UTC)
        assert isinstance(parsed, tantivy.Query)

    def test_date_rewriting_applied_before_tantivy_parse(
        self,
        query_index: tantivy.Index,
    ) -> None:
        # created:today must be rewritten to an ISO range before Tantivy parses it;
        # if passed raw, Tantivy would reject "today" as an invalid date value
        with time_machine.travel(datetime(2026, 3, 28, 12, 0, tzinfo=UTC), tick=False):
            parsed = parse_user_query(query_index, "created:today", UTC)
        assert isinstance(parsed, tantivy.Query)
||||
|
||||
class TestPassthrough:
    """Queries without field prefixes or unrelated content pass through unchanged."""

    def test_bare_keyword_no_field_prefix_unchanged(self) -> None:
        # Bare 'today' with no field: prefix passes through unchanged
        assert "today" in rewrite_natural_date_keywords("bank statement today", UTC)

    def test_unrelated_query_unchanged(self) -> None:
        assert rewrite_natural_date_keywords("title:invoice", UTC) == "title:invoice"
||||
|
||||
class TestNormalizeQuery:
    """normalize_query expands comma-separated values and collapses whitespace."""

    def test_normalize_expands_comma_separated_tags(self) -> None:
        assert normalize_query("tag:foo,bar") == "tag:foo AND tag:bar"

    def test_normalize_expands_three_values(self) -> None:
        expanded = normalize_query("tag:foo,bar,baz")
        assert expanded == "tag:foo AND tag:bar AND tag:baz"

    def test_normalize_collapses_whitespace(self) -> None:
        assert normalize_query("bank statement") == "bank statement"

    def test_normalize_no_commas_unchanged(self) -> None:
        assert normalize_query("bank statement") == "bank statement"
||||
|
||||
class TestPermissionFilter:
    """build_permission_filter tests use an in-memory index — no DB access needed."""

    @pytest.fixture
    def perm_index(self) -> tantivy.Index:
        index = tantivy.Index(build_schema(), path=None)
        register_tokenizers(index, "")
        return index

    @staticmethod
    def _fake_user(pk: int):
        # Minimal stand-in: build_permission_filter only needs a ``.pk`` attribute.
        return type("U", (), {"pk": pk})()

    def _add_doc(
        self,
        idx: tantivy.Index,
        doc_id: int,
        owner_id: int | None = None,
        viewer_ids: tuple[int, ...] = (),
    ) -> None:
        writer = idx.writer()
        entry = tantivy.Document()
        entry.add_unsigned("id", doc_id)
        # Only add owner_id field if the document has an owner
        if owner_id is not None:
            entry.add_unsigned("owner_id", owner_id)
        for viewer in viewer_ids:
            entry.add_unsigned("viewer_id", viewer)
        writer.add_document(entry)
        writer.commit()
        idx.reload()

    def _visible_count(self, idx: tantivy.Index, pk: int) -> int:
        # Number of documents the user with the given pk may see.
        query = build_permission_filter(idx.schema, self._fake_user(pk))
        return idx.searcher().search(query, limit=10).count

    def test_perm_no_owner_visible_to_any_user(self, perm_index: tantivy.Index) -> None:
        self._add_doc(perm_index, doc_id=1, owner_id=None)
        assert self._visible_count(perm_index, 99) == 1

    def test_perm_owned_by_user_is_visible(self, perm_index: tantivy.Index) -> None:
        self._add_doc(perm_index, doc_id=2, owner_id=42)
        assert self._visible_count(perm_index, 42) == 1

    def test_perm_owned_by_other_not_visible(self, perm_index: tantivy.Index) -> None:
        self._add_doc(perm_index, doc_id=3, owner_id=42)
        assert self._visible_count(perm_index, 99) == 0

    def test_perm_shared_viewer_is_visible(self, perm_index: tantivy.Index) -> None:
        self._add_doc(perm_index, doc_id=4, owner_id=42, viewer_ids=(99,))
        assert self._visible_count(perm_index, 99) == 1

    def test_perm_only_owned_docs_hidden_from_others(
        self,
        perm_index: tantivy.Index,
    ) -> None:
        self._add_doc(perm_index, doc_id=5, owner_id=10)  # owned by 10
        self._add_doc(perm_index, doc_id=6, owner_id=None)  # unowned
        assert self._visible_count(perm_index, 20) == 1  # only unowned
||||
77
src/documents/tests/search/test_tokenizer.py
Normal file
77
src/documents/tests/search/test_tokenizer.py
Normal file
@@ -0,0 +1,77 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
import tantivy
|
||||
|
||||
from documents.search._tokenizer import _bigram_analyzer
|
||||
from documents.search._tokenizer import _paperless_text
|
||||
from documents.search._tokenizer import register_tokenizers
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from _pytest.logging import LogCaptureFixture
|
||||
|
||||
pytestmark = pytest.mark.search
|
||||
|
||||
|
||||
class TestTokenizers:
    @pytest.fixture
    def content_index(self) -> tantivy.Index:
        """Index with just a content field for ASCII folding tests."""
        builder = tantivy.SchemaBuilder()
        builder.add_text_field("content", stored=True, tokenizer_name="paperless_text")
        index = tantivy.Index(builder.build(), path=None)
        index.register_tokenizer("paperless_text", _paperless_text(""))
        return index

    @pytest.fixture
    def bigram_index(self) -> tantivy.Index:
        """Index with bigram field for CJK tests."""
        builder = tantivy.SchemaBuilder()
        builder.add_text_field(
            "bigram_content",
            stored=False,
            tokenizer_name="bigram_analyzer",
        )
        index = tantivy.Index(builder.build(), path=None)
        index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
        return index

    @staticmethod
    def _index_text(index: tantivy.Index, field: str, text: str) -> None:
        # Commit a single one-field document and make it visible to searchers.
        writer = index.writer()
        entry = tantivy.Document()
        entry.add_text(field, text)
        writer.add_document(entry)
        writer.commit()
        index.reload()

    def test_ascii_fold_finds_accented_content(
        self,
        content_index: tantivy.Index,
    ) -> None:
        """paperless_text normalises diacritics so café is findable as cafe."""
        self._index_text(content_index, "content", "café résumé")
        query = content_index.parse_query("cafe resume", ["content"])
        assert content_index.searcher().search(query, limit=5).count == 1

    def test_bigram_finds_cjk_substring(self, bigram_index: tantivy.Index) -> None:
        """bigram_analyzer makes CJK substrings searchable without whitespace."""
        self._index_text(bigram_index, "bigram_content", "東京都")
        query = bigram_index.parse_query("東京", ["bigram_content"])
        assert bigram_index.searcher().search(query, limit=5).count == 1

    def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
        builder = tantivy.SchemaBuilder()
        builder.add_text_field("content", stored=True, tokenizer_name="paperless_text")
        index = tantivy.Index(builder.build(), path=None)

        with caplog.at_level(logging.WARNING, logger="paperless.search"):
            register_tokenizers(index, "klingon")
        assert "klingon" in caplog.text
||||
@@ -1,6 +1,7 @@
|
||||
import types
|
||||
from unittest.mock import patch
|
||||
|
||||
import tantivy
|
||||
from django.contrib.admin.sites import AdminSite
|
||||
from django.contrib.auth.models import Permission
|
||||
from django.contrib.auth.models import User
|
||||
@@ -8,36 +9,54 @@ from django.test import TestCase
|
||||
from django.utils import timezone
|
||||
from rest_framework import status
|
||||
|
||||
from documents import index
|
||||
from documents.admin import DocumentAdmin
|
||||
from documents.admin import TagAdmin
|
||||
from documents.models import Document
|
||||
from documents.models import Tag
|
||||
from documents.search import get_backend
|
||||
from documents.search import reset_backend
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from paperless.admin import PaperlessUserAdmin
|
||||
|
||||
|
||||
class TestDocumentAdmin(DirectoriesMixin, TestCase):
|
||||
def get_document_from_index(self, doc):
|
||||
ix = index.open_index()
|
||||
with ix.searcher() as searcher:
|
||||
return searcher.document(id=doc.id)
|
||||
backend = get_backend()
|
||||
searcher = backend._index.searcher()
|
||||
results = searcher.search(
|
||||
tantivy.Query.range_query(
|
||||
backend._schema,
|
||||
"id",
|
||||
tantivy.FieldType.Unsigned,
|
||||
doc.pk,
|
||||
doc.pk,
|
||||
),
|
||||
limit=1,
|
||||
)
|
||||
if results.hits:
|
||||
return searcher.doc(results.hits[0][1]).to_dict()
|
||||
return None
|
||||
|
||||
def setUp(self) -> None:
|
||||
super().setUp()
|
||||
reset_backend()
|
||||
self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite())
|
||||
|
||||
def tearDown(self) -> None:
|
||||
reset_backend()
|
||||
super().tearDown()
|
||||
|
||||
def test_save_model(self) -> None:
|
||||
doc = Document.objects.create(title="test")
|
||||
|
||||
doc.title = "new title"
|
||||
self.doc_admin.save_model(None, doc, None, None)
|
||||
self.assertEqual(Document.objects.get(id=doc.id).title, "new title")
|
||||
self.assertEqual(self.get_document_from_index(doc)["id"], doc.id)
|
||||
self.assertEqual(self.get_document_from_index(doc)["id"], [doc.id])
|
||||
|
||||
def test_delete_model(self) -> None:
|
||||
doc = Document.objects.create(title="test")
|
||||
index.add_or_update_document(doc)
|
||||
get_backend().add_or_update(doc)
|
||||
self.assertIsNotNone(self.get_document_from_index(doc))
|
||||
|
||||
self.doc_admin.delete_model(None, doc)
|
||||
@@ -53,7 +72,7 @@ class TestDocumentAdmin(DirectoriesMixin, TestCase):
|
||||
checksum=f"{i:02}",
|
||||
)
|
||||
docs.append(doc)
|
||||
index.add_or_update_document(doc)
|
||||
get_backend().add_or_update(doc)
|
||||
|
||||
self.assertEqual(Document.objects.count(), 42)
|
||||
|
||||
|
||||
@@ -109,7 +109,7 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase):
|
||||
mime_type="application/pdf",
|
||||
)
|
||||
|
||||
with mock.patch("documents.index.remove_document_from_index"):
|
||||
with mock.patch("documents.search.get_backend"):
|
||||
resp = self.client.delete(f"/api/documents/{root.id}/versions/{root.id}/")
|
||||
|
||||
self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)
|
||||
@@ -137,10 +137,7 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase):
|
||||
content="v2-content",
|
||||
)
|
||||
|
||||
with (
|
||||
mock.patch("documents.index.remove_document_from_index"),
|
||||
mock.patch("documents.index.add_or_update_document"),
|
||||
):
|
||||
with mock.patch("documents.search.get_backend"):
|
||||
resp = self.client.delete(f"/api/documents/{root.id}/versions/{v2.id}/")
|
||||
|
||||
self.assertEqual(resp.status_code, status.HTTP_200_OK)
|
||||
@@ -149,10 +146,7 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase):
|
||||
root.refresh_from_db()
|
||||
self.assertEqual(root.content, "root-content")
|
||||
|
||||
with (
|
||||
mock.patch("documents.index.remove_document_from_index"),
|
||||
mock.patch("documents.index.add_or_update_document"),
|
||||
):
|
||||
with mock.patch("documents.search.get_backend"):
|
||||
resp = self.client.delete(f"/api/documents/{root.id}/versions/{v1.id}/")
|
||||
|
||||
self.assertEqual(resp.status_code, status.HTTP_200_OK)
|
||||
@@ -175,10 +169,7 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase):
|
||||
)
|
||||
version_id = version.id
|
||||
|
||||
with (
|
||||
mock.patch("documents.index.remove_document_from_index"),
|
||||
mock.patch("documents.index.add_or_update_document"),
|
||||
):
|
||||
with mock.patch("documents.search.get_backend"):
|
||||
resp = self.client.delete(
|
||||
f"/api/documents/{root.id}/versions/{version_id}/",
|
||||
)
|
||||
@@ -225,7 +216,7 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase):
|
||||
root_document=other_root,
|
||||
)
|
||||
|
||||
with mock.patch("documents.index.remove_document_from_index"):
|
||||
with mock.patch("documents.search.get_backend"):
|
||||
resp = self.client.delete(
|
||||
f"/api/documents/{root.id}/versions/{other_version.id}/",
|
||||
)
|
||||
@@ -245,10 +236,7 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase):
|
||||
root_document=root,
|
||||
)
|
||||
|
||||
with (
|
||||
mock.patch("documents.index.remove_document_from_index"),
|
||||
mock.patch("documents.index.add_or_update_document"),
|
||||
):
|
||||
with mock.patch("documents.search.get_backend"):
|
||||
resp = self.client.delete(
|
||||
f"/api/documents/{version.id}/versions/{version.id}/",
|
||||
)
|
||||
@@ -275,18 +263,17 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase):
|
||||
root_document=root,
|
||||
)
|
||||
|
||||
with (
|
||||
mock.patch("documents.index.remove_document_from_index") as remove_index,
|
||||
mock.patch("documents.index.add_or_update_document") as add_or_update,
|
||||
):
|
||||
with mock.patch("documents.search.get_backend") as mock_get_backend:
|
||||
mock_backend = mock.MagicMock()
|
||||
mock_get_backend.return_value = mock_backend
|
||||
resp = self.client.delete(
|
||||
f"/api/documents/{root.id}/versions/{version.id}/",
|
||||
)
|
||||
|
||||
self.assertEqual(resp.status_code, status.HTTP_200_OK)
|
||||
remove_index.assert_called_once_with(version)
|
||||
add_or_update.assert_called_once()
|
||||
self.assertEqual(add_or_update.call_args[0][0].id, root.id)
|
||||
mock_backend.remove.assert_called_once_with(version.pk)
|
||||
mock_backend.add_or_update.assert_called_once()
|
||||
self.assertEqual(mock_backend.add_or_update.call_args[0][0].id, root.id)
|
||||
|
||||
def test_delete_version_returns_403_without_permission(self) -> None:
|
||||
owner = User.objects.create_user(username="owner")
|
||||
|
||||
@@ -11,9 +11,7 @@ from django.utils import timezone
|
||||
from guardian.shortcuts import assign_perm
|
||||
from rest_framework import status
|
||||
from rest_framework.test import APITestCase
|
||||
from whoosh.writing import AsyncWriter
|
||||
|
||||
from documents import index
|
||||
from documents.bulk_edit import set_permissions
|
||||
from documents.models import Correspondent
|
||||
from documents.models import CustomField
|
||||
@@ -25,6 +23,8 @@ from documents.models import SavedView
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.models import Workflow
|
||||
from documents.search import get_backend
|
||||
from documents.search import reset_backend
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from paperless_mail.models import MailAccount
|
||||
from paperless_mail.models import MailRule
|
||||
@@ -33,10 +33,15 @@ from paperless_mail.models import MailRule
|
||||
class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
def setUp(self) -> None:
|
||||
super().setUp()
|
||||
reset_backend()
|
||||
|
||||
self.user = User.objects.create_superuser(username="temp_admin")
|
||||
self.client.force_authenticate(user=self.user)
|
||||
|
||||
def tearDown(self) -> None:
|
||||
reset_backend()
|
||||
super().tearDown()
|
||||
|
||||
def test_search(self) -> None:
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
@@ -57,13 +62,11 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
checksum="C",
|
||||
original_filename="someepdf.pdf",
|
||||
)
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
# Note to future self: there is a reason we dont use a model signal handler to update the index: some operations edit many documents at once
|
||||
# (retagger, renamer) and we don't want to open a writer for each of these, but rather perform the entire operation with one writer.
|
||||
# That's why we can't open the writer in a model on_save handler or something.
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
backend = get_backend()
|
||||
backend.add_or_update(d1)
|
||||
backend.add_or_update(d2)
|
||||
backend.add_or_update(d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=bank")
|
||||
results = response.data["results"]
|
||||
self.assertEqual(response.data["count"], 3)
|
||||
@@ -125,10 +128,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
value_int=20,
|
||||
)
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
backend = get_backend()
|
||||
backend.add_or_update(d1)
|
||||
backend.add_or_update(d2)
|
||||
backend.add_or_update(d3)
|
||||
|
||||
response = self.client.get(
|
||||
f"/api/documents/?query=match&ordering=custom_field_{custom_field.pk}",
|
||||
@@ -149,15 +152,15 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
)
|
||||
|
||||
def test_search_multi_page(self) -> None:
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
for i in range(55):
|
||||
doc = Document.objects.create(
|
||||
checksum=str(i),
|
||||
pk=i + 1,
|
||||
title=f"Document {i + 1}",
|
||||
content="content",
|
||||
)
|
||||
index.update_document(writer, doc)
|
||||
backend = get_backend()
|
||||
for i in range(55):
|
||||
doc = Document.objects.create(
|
||||
checksum=str(i),
|
||||
pk=i + 1,
|
||||
title=f"Document {i + 1}",
|
||||
content="content",
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
# This is here so that we test that no document gets returned twice (might happen if the paging is not working)
|
||||
seen_ids = []
|
||||
@@ -184,15 +187,15 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
seen_ids.append(result["id"])
|
||||
|
||||
def test_search_invalid_page(self) -> None:
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
for i in range(15):
|
||||
doc = Document.objects.create(
|
||||
checksum=str(i),
|
||||
pk=i + 1,
|
||||
title=f"Document {i + 1}",
|
||||
content="content",
|
||||
)
|
||||
index.update_document(writer, doc)
|
||||
backend = get_backend()
|
||||
for i in range(15):
|
||||
doc = Document.objects.create(
|
||||
checksum=str(i),
|
||||
pk=i + 1,
|
||||
title=f"Document {i + 1}",
|
||||
content="content",
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
response = self.client.get("/api/documents/?query=content&page=0&page_size=10")
|
||||
self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND)
|
||||
@@ -230,26 +233,25 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
pk=3,
|
||||
checksum="C",
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
backend = get_backend()
|
||||
backend.add_or_update(d1)
|
||||
backend.add_or_update(d2)
|
||||
backend.add_or_update(d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
|
||||
results = response.data["results"]
|
||||
# Expect 3 documents returned
|
||||
self.assertEqual(len(results), 3)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[
|
||||
{"id": 1, "title": "invoice"},
|
||||
{"id": 2, "title": "bank statement 1"},
|
||||
{"id": 3, "title": "bank statement 3"},
|
||||
],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
result_map = {r["id"]: r for r in results}
|
||||
self.assertEqual(set(result_map.keys()), {1, 2, 3})
|
||||
for subset in [
|
||||
{"id": 1, "title": "invoice"},
|
||||
{"id": 2, "title": "bank statement 1"},
|
||||
{"id": 3, "title": "bank statement 3"},
|
||||
]:
|
||||
r = result_map[subset["id"]]
|
||||
self.assertDictEqual(r, {**r, **subset})
|
||||
|
||||
@override_settings(
|
||||
TIME_ZONE="America/Chicago",
|
||||
@@ -285,10 +287,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
# 7 days, 1 hour and 1 minute ago
|
||||
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
backend = get_backend()
|
||||
backend.add_or_update(d1)
|
||||
backend.add_or_update(d2)
|
||||
backend.add_or_update(d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
|
||||
results = response.data["results"]
|
||||
@@ -296,12 +298,14 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
# Expect 2 documents returned
|
||||
self.assertEqual(len(results), 2)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[{"id": 1, "title": "invoice"}, {"id": 2, "title": "bank statement 1"}],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
result_map = {r["id"]: r for r in results}
|
||||
self.assertEqual(set(result_map.keys()), {1, 2})
|
||||
for subset in [
|
||||
{"id": 1, "title": "invoice"},
|
||||
{"id": 2, "title": "bank statement 1"},
|
||||
]:
|
||||
r = result_map[subset["id"]]
|
||||
self.assertDictEqual(r, {**r, **subset})
|
||||
|
||||
@override_settings(
|
||||
TIME_ZONE="Europe/Sofia",
|
||||
@@ -337,10 +341,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
# 7 days, 1 hour and 1 minute ago
|
||||
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
backend = get_backend()
|
||||
backend.add_or_update(d1)
|
||||
backend.add_or_update(d2)
|
||||
backend.add_or_update(d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
|
||||
results = response.data["results"]
|
||||
@@ -348,12 +352,14 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
# Expect 2 documents returned
|
||||
self.assertEqual(len(results), 2)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[{"id": 1, "title": "invoice"}, {"id": 2, "title": "bank statement 1"}],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
result_map = {r["id"]: r for r in results}
|
||||
self.assertEqual(set(result_map.keys()), {1, 2})
|
||||
for subset in [
|
||||
{"id": 1, "title": "invoice"},
|
||||
{"id": 2, "title": "bank statement 1"},
|
||||
]:
|
||||
r = result_map[subset["id"]]
|
||||
self.assertDictEqual(r, {**r, **subset})
|
||||
|
||||
def test_search_added_in_last_month(self) -> None:
|
||||
"""
|
||||
@@ -389,10 +395,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
|
||||
)
|
||||
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
backend = get_backend()
|
||||
backend.add_or_update(d1)
|
||||
backend.add_or_update(d2)
|
||||
backend.add_or_update(d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 month to now]")
|
||||
results = response.data["results"]
|
||||
@@ -400,12 +406,14 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
# Expect 2 documents returned
|
||||
self.assertEqual(len(results), 2)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[{"id": 1, "title": "invoice"}, {"id": 3, "title": "bank statement 3"}],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
result_map = {r["id"]: r for r in results}
|
||||
self.assertEqual(set(result_map.keys()), {1, 3})
|
||||
for subset in [
|
||||
{"id": 1, "title": "invoice"},
|
||||
{"id": 3, "title": "bank statement 3"},
|
||||
]:
|
||||
r = result_map[subset["id"]]
|
||||
self.assertDictEqual(r, {**r, **subset})
|
||||
|
||||
@override_settings(
|
||||
TIME_ZONE="America/Denver",
|
||||
@@ -445,10 +453,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
|
||||
)
|
||||
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
backend = get_backend()
|
||||
backend.add_or_update(d1)
|
||||
backend.add_or_update(d2)
|
||||
backend.add_or_update(d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 month to now]")
|
||||
results = response.data["results"]
|
||||
@@ -456,12 +464,14 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
# Expect 2 documents returned
|
||||
self.assertEqual(len(results), 2)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[{"id": 1, "title": "invoice"}, {"id": 3, "title": "bank statement 3"}],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
result_map = {r["id"]: r for r in results}
|
||||
self.assertEqual(set(result_map.keys()), {1, 3})
|
||||
for subset in [
|
||||
{"id": 1, "title": "invoice"},
|
||||
{"id": 3, "title": "bank statement 3"},
|
||||
]:
|
||||
r = result_map[subset["id"]]
|
||||
self.assertDictEqual(r, {**r, **subset})
|
||||
|
||||
@override_settings(
|
||||
TIME_ZONE="Europe/Sofia",
|
||||
@@ -501,10 +511,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
# Django converts dates to UTC
|
||||
d3.refresh_from_db()
|
||||
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
backend = get_backend()
|
||||
backend.add_or_update(d1)
|
||||
backend.add_or_update(d2)
|
||||
backend.add_or_update(d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:20231201")
|
||||
results = response.data["results"]
|
||||
@@ -512,12 +522,8 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
# Expect 1 document returned
|
||||
self.assertEqual(len(results), 1)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[{"id": 3, "title": "bank statement 3"}],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
self.assertEqual(results[0]["id"], 3)
|
||||
self.assertEqual(results[0]["title"], "bank statement 3")
|
||||
|
||||
def test_search_added_invalid_date(self) -> None:
|
||||
"""
|
||||
@@ -526,7 +532,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
WHEN:
|
||||
- Query with invalid added date
|
||||
THEN:
|
||||
- No documents returned
|
||||
- 400 Bad Request returned (Tantivy rejects invalid date field syntax)
|
||||
"""
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
@@ -535,16 +541,14 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
pk=1,
|
||||
)
|
||||
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
get_backend().add_or_update(d1)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:invalid-date")
|
||||
results = response.data["results"]
|
||||
|
||||
# Expect 0 document returned
|
||||
self.assertEqual(len(results), 0)
|
||||
# Tantivy rejects unparsable field queries with a 400
|
||||
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
|
||||
|
||||
@mock.patch("documents.index.autocomplete")
|
||||
@mock.patch("documents.search._backend.TantivyBackend.autocomplete")
|
||||
def test_search_autocomplete_limits(self, m) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
@@ -556,7 +560,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
- Limit requests are obeyed
|
||||
"""
|
||||
|
||||
m.side_effect = lambda ix, term, limit, user: [term for _ in range(limit)]
|
||||
m.side_effect = lambda term, limit, user=None: [term for _ in range(limit)]
|
||||
|
||||
response = self.client.get("/api/search/autocomplete/?term=test")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
@@ -609,32 +613,29 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
owner=u1,
|
||||
)
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
backend = get_backend()
|
||||
backend.add_or_update(d1)
|
||||
backend.add_or_update(d2)
|
||||
backend.add_or_update(d3)
|
||||
|
||||
response = self.client.get("/api/search/autocomplete/?term=app")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data, [b"apples", b"applebaum", b"appletini"])
|
||||
self.assertEqual(response.data, ["applebaum", "apples", "appletini"])
|
||||
|
||||
d3.owner = u2
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
index.update_document(writer, d3)
|
||||
d3.save()
|
||||
backend.add_or_update(d3)
|
||||
|
||||
response = self.client.get("/api/search/autocomplete/?term=app")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data, [b"apples", b"applebaum"])
|
||||
self.assertEqual(response.data, ["applebaum", "apples"])
|
||||
|
||||
assign_perm("view_document", u1, d3)
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
index.update_document(writer, d3)
|
||||
backend.add_or_update(d3)
|
||||
|
||||
response = self.client.get("/api/search/autocomplete/?term=app")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data, [b"apples", b"applebaum", b"appletini"])
|
||||
self.assertEqual(response.data, ["applebaum", "apples", "appletini"])
|
||||
|
||||
def test_search_autocomplete_field_name_match(self) -> None:
|
||||
"""
|
||||
@@ -652,8 +653,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
checksum="1",
|
||||
)
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
index.update_document(writer, d1)
|
||||
get_backend().add_or_update(d1)
|
||||
|
||||
response = self.client.get("/api/search/autocomplete/?term=created:2023")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
@@ -674,33 +674,36 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
checksum="1",
|
||||
)
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
index.update_document(writer, d1)
|
||||
get_backend().add_or_update(d1)
|
||||
|
||||
response = self.client.get("/api/search/autocomplete/?term=auto")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data[0], b"auto")
|
||||
self.assertEqual(response.data[0], "auto")
|
||||
|
||||
def test_search_spelling_suggestion(self) -> None:
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
for i in range(55):
|
||||
doc = Document.objects.create(
|
||||
checksum=str(i),
|
||||
pk=i + 1,
|
||||
title=f"Document {i + 1}",
|
||||
content=f"Things document {i + 1}",
|
||||
)
|
||||
index.update_document(writer, doc)
|
||||
def test_search_no_spelling_suggestion(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Documents exist with various terms
|
||||
WHEN:
|
||||
- Query for documents with any term
|
||||
THEN:
|
||||
- corrected_query is always None (Tantivy has no spell correction)
|
||||
"""
|
||||
backend = get_backend()
|
||||
for i in range(5):
|
||||
doc = Document.objects.create(
|
||||
checksum=str(i),
|
||||
pk=i + 1,
|
||||
title=f"Document {i + 1}",
|
||||
content=f"Things document {i + 1}",
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
response = self.client.get("/api/documents/?query=thing")
|
||||
correction = response.data["corrected_query"]
|
||||
|
||||
self.assertEqual(correction, "things")
|
||||
self.assertIsNone(response.data["corrected_query"])
|
||||
|
||||
response = self.client.get("/api/documents/?query=things")
|
||||
correction = response.data["corrected_query"]
|
||||
|
||||
self.assertEqual(correction, None)
|
||||
self.assertIsNone(response.data["corrected_query"])
|
||||
|
||||
def test_search_spelling_suggestion_suppressed_for_private_terms(self):
|
||||
owner = User.objects.create_user("owner")
|
||||
@@ -709,24 +712,24 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
Permission.objects.get(codename="view_document"),
|
||||
)
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
for i in range(55):
|
||||
private_doc = Document.objects.create(
|
||||
checksum=f"p{i}",
|
||||
pk=100 + i,
|
||||
title=f"Private Document {i + 1}",
|
||||
content=f"treasury document {i + 1}",
|
||||
owner=owner,
|
||||
)
|
||||
visible_doc = Document.objects.create(
|
||||
checksum=f"v{i}",
|
||||
pk=200 + i,
|
||||
title=f"Visible Document {i + 1}",
|
||||
content=f"public ledger {i + 1}",
|
||||
owner=attacker,
|
||||
)
|
||||
index.update_document(writer, private_doc)
|
||||
index.update_document(writer, visible_doc)
|
||||
backend = get_backend()
|
||||
for i in range(5):
|
||||
private_doc = Document.objects.create(
|
||||
checksum=f"p{i}",
|
||||
pk=100 + i,
|
||||
title=f"Private Document {i + 1}",
|
||||
content=f"treasury document {i + 1}",
|
||||
owner=owner,
|
||||
)
|
||||
visible_doc = Document.objects.create(
|
||||
checksum=f"v{i}",
|
||||
pk=200 + i,
|
||||
title=f"Visible Document {i + 1}",
|
||||
content=f"public ledger {i + 1}",
|
||||
owner=attacker,
|
||||
)
|
||||
backend.add_or_update(private_doc)
|
||||
backend.add_or_update(visible_doc)
|
||||
|
||||
self.client.force_authenticate(user=attacker)
|
||||
|
||||
@@ -736,26 +739,6 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.data["count"], 0)
|
||||
self.assertIsNone(response.data["corrected_query"])
|
||||
|
||||
@mock.patch(
|
||||
"whoosh.searching.Searcher.correct_query",
|
||||
side_effect=Exception("Test error"),
|
||||
)
|
||||
def test_corrected_query_error(self, mock_correct_query) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- A query that raises an error on correction
|
||||
WHEN:
|
||||
- API request for search with that query
|
||||
THEN:
|
||||
- The error is logged and the search proceeds
|
||||
"""
|
||||
with self.assertLogs("paperless.index", level="INFO") as cm:
|
||||
response = self.client.get("/api/documents/?query=2025-06-04")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
error_str = cm.output[0]
|
||||
expected_str = "Error while correcting query '2025-06-04': Test error"
|
||||
self.assertIn(expected_str, error_str)
|
||||
|
||||
def test_search_more_like(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
@@ -790,11 +773,11 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
pk=4,
|
||||
checksum="ABC",
|
||||
)
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
index.update_document(writer, d4)
|
||||
backend = get_backend()
|
||||
backend.add_or_update(d1)
|
||||
backend.add_or_update(d2)
|
||||
backend.add_or_update(d3)
|
||||
backend.add_or_update(d4)
|
||||
|
||||
response = self.client.get(f"/api/documents/?more_like_id={d2.id}")
|
||||
|
||||
@@ -802,9 +785,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
|
||||
results = response.data["results"]
|
||||
|
||||
self.assertEqual(len(results), 2)
|
||||
self.assertEqual(results[0]["id"], d3.id)
|
||||
self.assertEqual(results[1]["id"], d1.id)
|
||||
self.assertGreaterEqual(len(results), 1)
|
||||
result_ids = [r["id"] for r in results]
|
||||
self.assertIn(d3.id, result_ids)
|
||||
self.assertNotIn(d4.id, result_ids)
|
||||
|
||||
def test_search_more_like_requires_view_permission_on_seed_document(
|
||||
self,
|
||||
@@ -846,10 +830,10 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
pk=12,
|
||||
)
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
index.update_document(writer, private_seed)
|
||||
index.update_document(writer, visible_doc)
|
||||
index.update_document(writer, other_doc)
|
||||
backend = get_backend()
|
||||
backend.add_or_update(private_seed)
|
||||
backend.add_or_update(visible_doc)
|
||||
backend.add_or_update(other_doc)
|
||||
|
||||
self.client.force_authenticate(user=attacker)
|
||||
|
||||
@@ -923,9 +907,9 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
value_text="foobard4",
|
||||
)
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
for doc in Document.objects.all():
|
||||
index.update_document(writer, doc)
|
||||
backend = get_backend()
|
||||
for doc in Document.objects.all():
|
||||
backend.add_or_update(doc)
|
||||
|
||||
def search_query(q):
|
||||
r = self.client.get("/api/documents/?query=test" + q)
|
||||
@@ -1141,9 +1125,9 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
Document.objects.create(checksum="3", content="test 3", owner=u2)
|
||||
Document.objects.create(checksum="4", content="test 4")
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
for doc in Document.objects.all():
|
||||
index.update_document(writer, doc)
|
||||
backend = get_backend()
|
||||
for doc in Document.objects.all():
|
||||
backend.add_or_update(doc)
|
||||
|
||||
self.client.force_authenticate(user=u1)
|
||||
r = self.client.get("/api/documents/?query=test")
|
||||
@@ -1194,9 +1178,9 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
d3 = Document.objects.create(checksum="3", content="test 3", owner=u2)
|
||||
Document.objects.create(checksum="4", content="test 4")
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
for doc in Document.objects.all():
|
||||
index.update_document(writer, doc)
|
||||
backend = get_backend()
|
||||
for doc in Document.objects.all():
|
||||
backend.add_or_update(doc)
|
||||
|
||||
self.client.force_authenticate(user=u1)
|
||||
r = self.client.get("/api/documents/?query=test")
|
||||
@@ -1216,9 +1200,9 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
assign_perm("view_document", u1, d3)
|
||||
assign_perm("view_document", u2, d1)
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
for doc in [d1, d2, d3]:
|
||||
index.update_document(writer, doc)
|
||||
backend.add_or_update(d1)
|
||||
backend.add_or_update(d2)
|
||||
backend.add_or_update(d3)
|
||||
|
||||
self.client.force_authenticate(user=u1)
|
||||
r = self.client.get("/api/documents/?query=test")
|
||||
@@ -1281,9 +1265,9 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
user=u1,
|
||||
)
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
for doc in Document.objects.all():
|
||||
index.update_document(writer, doc)
|
||||
backend = get_backend()
|
||||
for doc in Document.objects.all():
|
||||
backend.add_or_update(doc)
|
||||
|
||||
def search_query(q):
|
||||
r = self.client.get("/api/documents/?query=test" + q)
|
||||
@@ -1316,13 +1300,14 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
search_query("&ordering=-num_notes"),
|
||||
[d1.id, d3.id, d2.id],
|
||||
)
|
||||
# owner sort: ORM orders by owner_id (integer); NULLs first in SQLite ASC
|
||||
self.assertListEqual(
|
||||
search_query("&ordering=owner"),
|
||||
[d1.id, d2.id, d3.id],
|
||||
[d3.id, d1.id, d2.id],
|
||||
)
|
||||
self.assertListEqual(
|
||||
search_query("&ordering=-owner"),
|
||||
[d3.id, d2.id, d1.id],
|
||||
[d2.id, d1.id, d3.id],
|
||||
)
|
||||
|
||||
@mock.patch("documents.bulk_edit.bulk_update_documents")
|
||||
@@ -1379,12 +1364,12 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
)
|
||||
set_permissions([4, 5], set_permissions={}, owner=user2, merge=False)
|
||||
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
index.update_document(writer, d4)
|
||||
index.update_document(writer, d5)
|
||||
backend = get_backend()
|
||||
backend.add_or_update(d1)
|
||||
backend.add_or_update(d2)
|
||||
backend.add_or_update(d3)
|
||||
backend.add_or_update(d4)
|
||||
backend.add_or_update(d5)
|
||||
|
||||
correspondent1 = Correspondent.objects.create(name="bank correspondent 1")
|
||||
Correspondent.objects.create(name="correspondent 2")
|
||||
|
||||
@@ -191,40 +191,42 @@ class TestSystemStatus(APITestCase):
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["tasks"]["celery_status"], "OK")
|
||||
|
||||
@override_settings(INDEX_DIR=Path("/tmp/index"))
|
||||
@mock.patch("whoosh.index.FileIndex.last_modified")
|
||||
def test_system_status_index_ok(self, mock_last_modified) -> None:
|
||||
@mock.patch("documents.search.get_backend")
|
||||
def test_system_status_index_ok(self, mock_get_backend) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- The index last modified time is set
|
||||
- The index is accessible
|
||||
WHEN:
|
||||
- The user requests the system status
|
||||
THEN:
|
||||
- The response contains the correct index status
|
||||
"""
|
||||
mock_last_modified.return_value = 1707839087
|
||||
self.client.force_login(self.user)
|
||||
response = self.client.get(self.ENDPOINT)
|
||||
mock_get_backend.return_value = mock.MagicMock()
|
||||
# Use the temp dir created in setUp (self.tmp_dir) as a real INDEX_DIR
|
||||
# with a real file so the mtime lookup works
|
||||
sentinel = self.tmp_dir / "sentinel.txt"
|
||||
sentinel.write_text("ok")
|
||||
with self.settings(INDEX_DIR=self.tmp_dir):
|
||||
self.client.force_login(self.user)
|
||||
response = self.client.get(self.ENDPOINT)
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["tasks"]["index_status"], "OK")
|
||||
self.assertIsNotNone(response.data["tasks"]["index_last_modified"])
|
||||
|
||||
@override_settings(INDEX_DIR=Path("/tmp/index/"))
|
||||
@mock.patch("documents.index.open_index", autospec=True)
|
||||
def test_system_status_index_error(self, mock_open_index) -> None:
|
||||
@mock.patch("documents.search.get_backend")
|
||||
def test_system_status_index_error(self, mock_get_backend) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- The index is not found
|
||||
- The index cannot be opened
|
||||
WHEN:
|
||||
- The user requests the system status
|
||||
THEN:
|
||||
- The response contains the correct index status
|
||||
"""
|
||||
mock_open_index.return_value = None
|
||||
mock_open_index.side_effect = Exception("Index error")
|
||||
mock_get_backend.side_effect = Exception("Index error")
|
||||
self.client.force_login(self.user)
|
||||
response = self.client.get(self.ENDPOINT)
|
||||
mock_open_index.assert_called_once()
|
||||
mock_get_backend.assert_called_once()
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["tasks"]["index_status"], "ERROR")
|
||||
self.assertIsNotNone(response.data["tasks"]["index_error"])
|
||||
|
||||
@@ -1,58 +0,0 @@
|
||||
from django.test import TestCase
|
||||
from whoosh import query
|
||||
|
||||
from documents.index import get_permissions_criterias
|
||||
from documents.models import User
|
||||
|
||||
|
||||
class TestDelayedQuery(TestCase):
|
||||
def setUp(self) -> None:
|
||||
super().setUp()
|
||||
# all tests run without permission criteria, so has_no_owner query will always
|
||||
# be appended.
|
||||
self.has_no_owner = query.Or([query.Term("has_owner", text=False)])
|
||||
|
||||
def _get_testset__id__in(self, param, field):
|
||||
return (
|
||||
{f"{param}__id__in": "42,43"},
|
||||
query.And(
|
||||
[
|
||||
query.Or(
|
||||
[
|
||||
query.Term(f"{field}_id", "42"),
|
||||
query.Term(f"{field}_id", "43"),
|
||||
],
|
||||
),
|
||||
self.has_no_owner,
|
||||
],
|
||||
),
|
||||
)
|
||||
|
||||
def _get_testset__id__none(self, param, field):
|
||||
return (
|
||||
{f"{param}__id__none": "42,43"},
|
||||
query.And(
|
||||
[
|
||||
query.Not(query.Term(f"{field}_id", "42")),
|
||||
query.Not(query.Term(f"{field}_id", "43")),
|
||||
self.has_no_owner,
|
||||
],
|
||||
),
|
||||
)
|
||||
|
||||
def test_get_permission_criteria(self) -> None:
|
||||
# tests contains tuples of user instances and the expected filter
|
||||
tests = (
|
||||
(None, [query.Term("has_owner", text=False)]),
|
||||
(User(42, username="foo", is_superuser=True), []),
|
||||
(
|
||||
User(42, username="foo", is_superuser=False),
|
||||
[
|
||||
query.Term("has_owner", text=False),
|
||||
query.Term("owner_id", 42),
|
||||
query.Term("viewer_id", "42"),
|
||||
],
|
||||
),
|
||||
)
|
||||
for user, expected in tests:
|
||||
self.assertEqual(get_permissions_criterias(user), expected)
|
||||
@@ -1,371 +0,0 @@
|
||||
from datetime import datetime
|
||||
from unittest import mock
|
||||
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import User
|
||||
from django.test import SimpleTestCase
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
from django.utils.timezone import get_current_timezone
|
||||
from django.utils.timezone import timezone
|
||||
|
||||
from documents import index
|
||||
from documents.models import Document
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
|
||||
|
||||
class TestAutoComplete(DirectoriesMixin, TestCase):
|
||||
def test_auto_complete(self) -> None:
|
||||
doc1 = Document.objects.create(
|
||||
title="doc1",
|
||||
checksum="A",
|
||||
content="test test2 test3",
|
||||
)
|
||||
doc2 = Document.objects.create(title="doc2", checksum="B", content="test test2")
|
||||
doc3 = Document.objects.create(title="doc3", checksum="C", content="test2")
|
||||
|
||||
index.add_or_update_document(doc1)
|
||||
index.add_or_update_document(doc2)
|
||||
index.add_or_update_document(doc3)
|
||||
|
||||
ix = index.open_index()
|
||||
|
||||
self.assertListEqual(
|
||||
index.autocomplete(ix, "tes"),
|
||||
[b"test2", b"test", b"test3"],
|
||||
)
|
||||
self.assertListEqual(
|
||||
index.autocomplete(ix, "tes", limit=3),
|
||||
[b"test2", b"test", b"test3"],
|
||||
)
|
||||
self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test2"])
|
||||
self.assertListEqual(index.autocomplete(ix, "tes", limit=0), [])
|
||||
|
||||
def test_archive_serial_number_ranging(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Document with an archive serial number above schema allowed size
|
||||
WHEN:
|
||||
- Document is provided to the index
|
||||
THEN:
|
||||
- Error is logged
|
||||
- Document ASN is reset to 0 for the index
|
||||
"""
|
||||
doc1 = Document.objects.create(
|
||||
title="doc1",
|
||||
checksum="A",
|
||||
content="test test2 test3",
|
||||
# yes, this is allowed, unless full_clean is run
|
||||
# DRF does call the validators, this test won't
|
||||
archive_serial_number=Document.ARCHIVE_SERIAL_NUMBER_MAX + 1,
|
||||
)
|
||||
with self.assertLogs("paperless.index", level="ERROR") as cm:
|
||||
with mock.patch(
|
||||
"documents.index.AsyncWriter.update_document",
|
||||
) as mocked_update_doc:
|
||||
index.add_or_update_document(doc1)
|
||||
|
||||
mocked_update_doc.assert_called_once()
|
||||
_, kwargs = mocked_update_doc.call_args
|
||||
|
||||
self.assertEqual(kwargs["asn"], 0)
|
||||
|
||||
error_str = cm.output[0]
|
||||
expected_str = "ERROR:paperless.index:Not indexing Archive Serial Number 4294967296 of document 1"
|
||||
self.assertIn(expected_str, error_str)
|
||||
|
||||
def test_archive_serial_number_is_none(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Document with no archive serial number
|
||||
WHEN:
|
||||
- Document is provided to the index
|
||||
THEN:
|
||||
- ASN isn't touched
|
||||
"""
|
||||
doc1 = Document.objects.create(
|
||||
title="doc1",
|
||||
checksum="A",
|
||||
content="test test2 test3",
|
||||
)
|
||||
with mock.patch(
|
||||
"documents.index.AsyncWriter.update_document",
|
||||
) as mocked_update_doc:
|
||||
index.add_or_update_document(doc1)
|
||||
|
||||
mocked_update_doc.assert_called_once()
|
||||
_, kwargs = mocked_update_doc.call_args
|
||||
|
||||
self.assertIsNone(kwargs["asn"])
|
||||
|
||||
@override_settings(TIME_ZONE="Pacific/Auckland")
|
||||
def test_added_today_respects_local_timezone_boundary(self) -> None:
|
||||
tz = get_current_timezone()
|
||||
fixed_now = datetime(2025, 7, 20, 15, 0, 0, tzinfo=tz)
|
||||
|
||||
# Fake a time near the local boundary (1 AM NZT = 13:00 UTC on previous UTC day)
|
||||
local_dt = datetime(2025, 7, 20, 1, 0, 0).replace(tzinfo=tz)
|
||||
utc_dt = local_dt.astimezone(timezone.utc)
|
||||
|
||||
doc = Document.objects.create(
|
||||
title="Time zone",
|
||||
content="Testing added:today",
|
||||
checksum="edgecase123",
|
||||
added=utc_dt,
|
||||
)
|
||||
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, doc)
|
||||
|
||||
superuser = User.objects.create_superuser(username="testuser")
|
||||
self.client.force_login(superuser)
|
||||
|
||||
with mock.patch("documents.index.now", return_value=fixed_now):
|
||||
response = self.client.get("/api/documents/?query=added:today")
|
||||
results = response.json()["results"]
|
||||
self.assertEqual(len(results), 1)
|
||||
self.assertEqual(results[0]["id"], doc.id)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:yesterday")
|
||||
results = response.json()["results"]
|
||||
self.assertEqual(len(results), 0)
|
||||
|
||||
|
||||
@override_settings(TIME_ZONE="UTC")
|
||||
class TestRewriteNaturalDateKeywords(SimpleTestCase):
|
||||
"""
|
||||
Unit tests for rewrite_natural_date_keywords function.
|
||||
"""
|
||||
|
||||
def _rewrite_with_now(self, query: str, now_dt: datetime) -> str:
|
||||
with mock.patch("documents.index.now", return_value=now_dt):
|
||||
return index.rewrite_natural_date_keywords(query)
|
||||
|
||||
def _assert_rewrite_contains(
|
||||
self,
|
||||
query: str,
|
||||
now_dt: datetime,
|
||||
*expected_fragments: str,
|
||||
) -> str:
|
||||
result = self._rewrite_with_now(query, now_dt)
|
||||
for fragment in expected_fragments:
|
||||
self.assertIn(fragment, result)
|
||||
return result
|
||||
|
||||
def test_range_keywords(self) -> None:
|
||||
"""
|
||||
Test various different range keywords
|
||||
"""
|
||||
cases = [
|
||||
(
|
||||
"added:today",
|
||||
datetime(2025, 7, 20, 15, 30, 45, tzinfo=timezone.utc),
|
||||
("added:[20250720", "TO 20250720"),
|
||||
),
|
||||
(
|
||||
"added:yesterday",
|
||||
datetime(2025, 7, 20, 15, 30, 45, tzinfo=timezone.utc),
|
||||
("added:[20250719", "TO 20250719"),
|
||||
),
|
||||
(
|
||||
"added:this month",
|
||||
datetime(2025, 7, 15, 12, 0, 0, tzinfo=timezone.utc),
|
||||
("added:[20250701", "TO 20250731"),
|
||||
),
|
||||
(
|
||||
"added:previous month",
|
||||
datetime(2025, 7, 15, 12, 0, 0, tzinfo=timezone.utc),
|
||||
("added:[20250601", "TO 20250630"),
|
||||
),
|
||||
(
|
||||
"added:this year",
|
||||
datetime(2025, 7, 15, 12, 0, 0, tzinfo=timezone.utc),
|
||||
("added:[20250101", "TO 20251231"),
|
||||
),
|
||||
(
|
||||
"added:previous year",
|
||||
datetime(2025, 7, 15, 12, 0, 0, tzinfo=timezone.utc),
|
||||
("added:[20240101", "TO 20241231"),
|
||||
),
|
||||
# Previous quarter from July 15, 2025 is April-June.
|
||||
(
|
||||
"added:previous quarter",
|
||||
datetime(2025, 7, 15, 12, 0, 0, tzinfo=timezone.utc),
|
||||
("added:[20250401", "TO 20250630"),
|
||||
),
|
||||
# July 20, 2025 is a Sunday (weekday 6) so previous week is July 7-13.
|
||||
(
|
||||
"added:previous week",
|
||||
datetime(2025, 7, 20, 12, 0, 0, tzinfo=timezone.utc),
|
||||
("added:[20250707", "TO 20250713"),
|
||||
),
|
||||
]
|
||||
|
||||
for query, now_dt, fragments in cases:
|
||||
with self.subTest(query=query):
|
||||
self._assert_rewrite_contains(query, now_dt, *fragments)
|
||||
|
||||
def test_additional_fields(self) -> None:
|
||||
fixed_now = datetime(2025, 7, 20, 15, 30, 45, tzinfo=timezone.utc)
|
||||
# created
|
||||
self._assert_rewrite_contains("created:today", fixed_now, "created:[20250720")
|
||||
# modified
|
||||
self._assert_rewrite_contains("modified:today", fixed_now, "modified:[20250720")
|
||||
|
||||
def test_basic_syntax_variants(self) -> None:
|
||||
"""
|
||||
Test that quoting, casing, and multi-clause queries are parsed.
|
||||
"""
|
||||
fixed_now = datetime(2025, 7, 20, 15, 30, 45, tzinfo=timezone.utc)
|
||||
|
||||
# quoted keywords
|
||||
result1 = self._rewrite_with_now('added:"today"', fixed_now)
|
||||
result2 = self._rewrite_with_now("added:'today'", fixed_now)
|
||||
self.assertIn("added:[20250720", result1)
|
||||
self.assertIn("added:[20250720", result2)
|
||||
|
||||
# case insensitivity
|
||||
for query in ("added:TODAY", "added:Today", "added:ToDaY"):
|
||||
with self.subTest(case_variant=query):
|
||||
self._assert_rewrite_contains(query, fixed_now, "added:[20250720")
|
||||
|
||||
# multiple clauses
|
||||
result = self._rewrite_with_now("added:today created:yesterday", fixed_now)
|
||||
self.assertIn("added:[20250720", result)
|
||||
self.assertIn("created:[20250719", result)
|
||||
|
||||
def test_no_match(self) -> None:
|
||||
"""
|
||||
Test that queries without keywords are unchanged.
|
||||
"""
|
||||
query = "title:test content:example"
|
||||
result = index.rewrite_natural_date_keywords(query)
|
||||
self.assertEqual(query, result)
|
||||
|
||||
@override_settings(TIME_ZONE="Pacific/Auckland")
|
||||
def test_timezone_awareness(self) -> None:
|
||||
"""
|
||||
Test timezone conversion.
|
||||
"""
|
||||
# July 20, 2025 1:00 AM NZST = July 19, 2025 13:00 UTC
|
||||
fixed_now = datetime(2025, 7, 20, 1, 0, 0, tzinfo=get_current_timezone())
|
||||
result = self._rewrite_with_now("added:today", fixed_now)
|
||||
# Should convert to UTC properly
|
||||
self.assertIn("added:[20250719", result)
|
||||
|
||||
|
||||
class TestIndexResilience(DirectoriesMixin, SimpleTestCase):
|
||||
def _assert_recreate_called(self, mock_create_in) -> None:
|
||||
mock_create_in.assert_called_once()
|
||||
path_arg, schema_arg = mock_create_in.call_args.args
|
||||
self.assertEqual(path_arg, settings.INDEX_DIR)
|
||||
self.assertEqual(schema_arg.__class__.__name__, "Schema")
|
||||
|
||||
def test_transient_missing_segment_does_not_force_recreate(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Index directory exists
|
||||
WHEN:
|
||||
- open_index is called
|
||||
- Opening the index raises FileNotFoundError once due to a
|
||||
transient missing segment
|
||||
THEN:
|
||||
- Index is opened successfully on retry
|
||||
- Index is not recreated
|
||||
"""
|
||||
file_marker = settings.INDEX_DIR / "file_marker.txt"
|
||||
file_marker.write_text("keep")
|
||||
expected_index = object()
|
||||
|
||||
with (
|
||||
mock.patch("documents.index.exists_in", return_value=True),
|
||||
mock.patch(
|
||||
"documents.index.open_dir",
|
||||
side_effect=[FileNotFoundError("missing"), expected_index],
|
||||
) as mock_open_dir,
|
||||
mock.patch(
|
||||
"documents.index.create_in",
|
||||
) as mock_create_in,
|
||||
mock.patch(
|
||||
"documents.index.rmtree",
|
||||
) as mock_rmtree,
|
||||
):
|
||||
ix = index.open_index()
|
||||
|
||||
self.assertIs(ix, expected_index)
|
||||
self.assertGreaterEqual(mock_open_dir.call_count, 2)
|
||||
mock_rmtree.assert_not_called()
|
||||
mock_create_in.assert_not_called()
|
||||
self.assertEqual(file_marker.read_text(), "keep")
|
||||
|
||||
def test_transient_errors_exhaust_retries_and_recreate(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Index directory exists
|
||||
WHEN:
|
||||
- open_index is called
|
||||
- Opening the index raises FileNotFoundError multiple times due to
|
||||
transient missing segments
|
||||
THEN:
|
||||
- Index is recreated after retries are exhausted
|
||||
"""
|
||||
recreated_index = object()
|
||||
|
||||
with (
|
||||
self.assertLogs("paperless.index", level="ERROR") as cm,
|
||||
mock.patch("documents.index.exists_in", return_value=True),
|
||||
mock.patch(
|
||||
"documents.index.open_dir",
|
||||
side_effect=FileNotFoundError("missing"),
|
||||
) as mock_open_dir,
|
||||
mock.patch("documents.index.rmtree") as mock_rmtree,
|
||||
mock.patch(
|
||||
"documents.index.create_in",
|
||||
return_value=recreated_index,
|
||||
) as mock_create_in,
|
||||
):
|
||||
ix = index.open_index()
|
||||
|
||||
self.assertIs(ix, recreated_index)
|
||||
self.assertEqual(mock_open_dir.call_count, 4)
|
||||
mock_rmtree.assert_called_once_with(settings.INDEX_DIR)
|
||||
self._assert_recreate_called(mock_create_in)
|
||||
self.assertIn(
|
||||
"Error while opening the index after retries, recreating.",
|
||||
cm.output[0],
|
||||
)
|
||||
|
||||
def test_non_transient_error_recreates_index(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Index directory exists
|
||||
WHEN:
|
||||
- open_index is called
|
||||
- Opening the index raises a "non-transient" error
|
||||
THEN:
|
||||
- Index is recreated
|
||||
"""
|
||||
recreated_index = object()
|
||||
|
||||
with (
|
||||
self.assertLogs("paperless.index", level="ERROR") as cm,
|
||||
mock.patch("documents.index.exists_in", return_value=True),
|
||||
mock.patch(
|
||||
"documents.index.open_dir",
|
||||
side_effect=RuntimeError("boom"),
|
||||
),
|
||||
mock.patch("documents.index.rmtree") as mock_rmtree,
|
||||
mock.patch(
|
||||
"documents.index.create_in",
|
||||
return_value=recreated_index,
|
||||
) as mock_create_in,
|
||||
):
|
||||
ix = index.open_index()
|
||||
|
||||
self.assertIs(ix, recreated_index)
|
||||
mock_rmtree.assert_called_once_with(settings.INDEX_DIR)
|
||||
self._assert_recreate_called(mock_create_in)
|
||||
self.assertIn(
|
||||
"Error while opening the index, recreating.",
|
||||
cm.output[0],
|
||||
)
|
||||
@@ -452,7 +452,10 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
|
||||
"""
|
||||
|
||||
def setUp(self) -> None:
|
||||
from documents.search import reset_backend
|
||||
|
||||
TestCase.setUp(self)
|
||||
reset_backend()
|
||||
User.objects.create_user(username="test_consumer", password="12345")
|
||||
self.doc_contains = Document.objects.create(
|
||||
content="I contain the keyword.",
|
||||
@@ -464,6 +467,9 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
|
||||
override_settings(INDEX_DIR=self.index_dir).enable()
|
||||
|
||||
def tearDown(self) -> None:
|
||||
from documents.search import reset_backend
|
||||
|
||||
reset_backend()
|
||||
shutil.rmtree(self.index_dir, ignore_errors=True)
|
||||
|
||||
def test_tag_applied_any(self) -> None:
|
||||
|
||||
@@ -208,10 +208,12 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase):
|
||||
mime_type="application/pdf",
|
||||
)
|
||||
|
||||
with mock.patch("documents.index.add_or_update_document") as add:
|
||||
with mock.patch("documents.search.get_backend") as mock_get_backend:
|
||||
mock_backend = mock.MagicMock()
|
||||
mock_get_backend.return_value = mock_backend
|
||||
add_to_index(sender=None, document=root)
|
||||
|
||||
add.assert_called_once_with(root)
|
||||
mock_backend.add_or_update.assert_called_once_with(root)
|
||||
|
||||
def test_add_to_index_reindexes_root_for_version_documents(self) -> None:
|
||||
root = Document.objects.create(
|
||||
@@ -226,13 +228,21 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase):
|
||||
root_document=root,
|
||||
)
|
||||
|
||||
with mock.patch("documents.index.add_or_update_document") as add:
|
||||
with mock.patch("documents.search.get_backend") as mock_get_backend:
|
||||
mock_backend = mock.MagicMock()
|
||||
mock_get_backend.return_value = mock_backend
|
||||
add_to_index(sender=None, document=version)
|
||||
|
||||
self.assertEqual(add.call_count, 2)
|
||||
self.assertEqual(add.call_args_list[0].args[0].id, version.id)
|
||||
self.assertEqual(add.call_args_list[1].args[0].id, root.id)
|
||||
self.assertEqual(mock_backend.add_or_update.call_count, 2)
|
||||
self.assertEqual(
|
||||
add.call_args_list[1].kwargs,
|
||||
mock_backend.add_or_update.call_args_list[0].args[0].id,
|
||||
version.id,
|
||||
)
|
||||
self.assertEqual(
|
||||
mock_backend.add_or_update.call_args_list[1].args[0].id,
|
||||
root.id,
|
||||
)
|
||||
self.assertEqual(
|
||||
mock_backend.add_or_update.call_args_list[1].kwargs,
|
||||
{"effective_content": version.content},
|
||||
)
|
||||
|
||||
@@ -157,11 +157,17 @@ class DirectoriesMixin:
|
||||
"""
|
||||
|
||||
def setUp(self) -> None:
|
||||
from documents.search import reset_backend
|
||||
|
||||
reset_backend()
|
||||
self.dirs = setup_directories()
|
||||
super().setUp()
|
||||
|
||||
def tearDown(self) -> None:
|
||||
from documents.search import reset_backend
|
||||
|
||||
super().tearDown()
|
||||
reset_backend()
|
||||
remove_dirs(self.dirs)
|
||||
|
||||
|
||||
|
||||
@@ -100,7 +100,6 @@ from rest_framework.viewsets import ReadOnlyModelViewSet
|
||||
from rest_framework.viewsets import ViewSet
|
||||
|
||||
from documents import bulk_edit
|
||||
from documents import index
|
||||
from documents.bulk_download import ArchiveOnlyStrategy
|
||||
from documents.bulk_download import OriginalAndArchiveStrategy
|
||||
from documents.bulk_download import OriginalsOnlyStrategy
|
||||
@@ -972,9 +971,9 @@ class DocumentViewSet(
|
||||
response_data["content"] = content_doc.content
|
||||
response = Response(response_data)
|
||||
|
||||
from documents import index
|
||||
from documents.search import get_backend
|
||||
|
||||
index.add_or_update_document(refreshed_doc)
|
||||
get_backend().add_or_update(refreshed_doc)
|
||||
|
||||
document_updated.send(
|
||||
sender=self.__class__,
|
||||
@@ -984,9 +983,9 @@ class DocumentViewSet(
|
||||
return response
|
||||
|
||||
def destroy(self, request, *args, **kwargs):
|
||||
from documents import index
|
||||
from documents.search import get_backend
|
||||
|
||||
index.remove_document_from_index(self.get_object())
|
||||
get_backend().remove(self.get_object().pk)
|
||||
try:
|
||||
return super().destroy(request, *args, **kwargs)
|
||||
except Exception as e:
|
||||
@@ -1393,9 +1392,9 @@ class DocumentViewSet(
|
||||
doc.modified = timezone.now()
|
||||
doc.save()
|
||||
|
||||
from documents import index
|
||||
from documents.search import get_backend
|
||||
|
||||
index.add_or_update_document(doc)
|
||||
get_backend().add_or_update(doc)
|
||||
|
||||
notes = serializer.to_representation(doc).get("notes")
|
||||
|
||||
@@ -1430,9 +1429,9 @@ class DocumentViewSet(
|
||||
doc.modified = timezone.now()
|
||||
doc.save()
|
||||
|
||||
from documents import index
|
||||
from documents.search import get_backend
|
||||
|
||||
index.add_or_update_document(doc)
|
||||
get_backend().add_or_update(doc)
|
||||
|
||||
notes = serializer.to_representation(doc).get("notes")
|
||||
|
||||
@@ -1744,12 +1743,13 @@ class DocumentViewSet(
|
||||
"Cannot delete the root/original version. Delete the document instead.",
|
||||
)
|
||||
|
||||
from documents import index
|
||||
from documents.search import get_backend
|
||||
|
||||
index.remove_document_from_index(version_doc)
|
||||
_backend = get_backend()
|
||||
_backend.remove(version_doc.pk)
|
||||
version_doc_id = version_doc.id
|
||||
version_doc.delete()
|
||||
index.add_or_update_document(root_doc)
|
||||
_backend.add_or_update(root_doc)
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
actor = (
|
||||
request.user if request.user and request.user.is_authenticated else None
|
||||
@@ -1949,10 +1949,6 @@ class ChatStreamingView(GenericAPIView):
|
||||
),
|
||||
)
|
||||
class UnifiedSearchViewSet(DocumentViewSet):
|
||||
def __init__(self, *args: Any, **kwargs: Any) -> None:
|
||||
super().__init__(*args, **kwargs)
|
||||
self.searcher = None
|
||||
|
||||
def get_serializer_class(self):
|
||||
if self._is_search_request():
|
||||
return SearchResultSerializer
|
||||
@@ -1965,17 +1961,34 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
or "more_like_id" in self.request.query_params
|
||||
)
|
||||
|
||||
def filter_queryset(self, queryset):
|
||||
filtered_queryset = super().filter_queryset(queryset)
|
||||
def list(self, request, *args, **kwargs):
|
||||
if not self._is_search_request():
|
||||
return super().list(request)
|
||||
|
||||
if self._is_search_request():
|
||||
if "query" in self.request.query_params:
|
||||
from documents import index
|
||||
from documents.search import TantivyRelevanceList
|
||||
from documents.search import get_backend
|
||||
|
||||
query_class = index.DelayedFullTextQuery
|
||||
elif "more_like_id" in self.request.query_params:
|
||||
try:
|
||||
backend = get_backend()
|
||||
# ORM-filtered queryset: permissions + field filters + ordering (DRF backends applied)
|
||||
filtered_qs = self.filter_queryset(self.get_queryset())
|
||||
|
||||
user = None if request.user.is_superuser else request.user
|
||||
|
||||
if "query" in request.query_params:
|
||||
query_str = request.query_params["query"]
|
||||
results = backend.search(
|
||||
query_str,
|
||||
user=user,
|
||||
page=1,
|
||||
page_size=10000,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
)
|
||||
else:
|
||||
# more_like_id — validate permission on the seed document first
|
||||
try:
|
||||
more_like_doc_id = int(self.request.query_params["more_like_id"])
|
||||
more_like_doc_id = int(request.query_params["more_like_id"])
|
||||
more_like_doc = Document.objects.select_related("owner").get(
|
||||
pk=more_like_doc_id,
|
||||
)
|
||||
@@ -1983,61 +1996,62 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
raise PermissionDenied(_("Invalid more_like_id"))
|
||||
|
||||
if not has_perms_owner_aware(
|
||||
self.request.user,
|
||||
request.user,
|
||||
"view_document",
|
||||
more_like_doc,
|
||||
):
|
||||
raise PermissionDenied(_("Insufficient permissions."))
|
||||
|
||||
from documents import index
|
||||
|
||||
query_class = index.DelayedMoreLikeThisQuery
|
||||
else:
|
||||
raise ValueError
|
||||
|
||||
return query_class(
|
||||
self.searcher,
|
||||
self.request.query_params,
|
||||
self.paginator.get_page_size(self.request),
|
||||
filter_queryset=filtered_queryset,
|
||||
)
|
||||
else:
|
||||
return filtered_queryset
|
||||
|
||||
def list(self, request, *args, **kwargs):
|
||||
if self._is_search_request():
|
||||
from documents import index
|
||||
|
||||
try:
|
||||
with index.open_index_searcher() as s:
|
||||
self.searcher = s
|
||||
queryset = self.filter_queryset(self.get_queryset())
|
||||
page = self.paginate_queryset(queryset)
|
||||
|
||||
serializer = self.get_serializer(page, many=True)
|
||||
response = self.get_paginated_response(serializer.data)
|
||||
|
||||
response.data["corrected_query"] = (
|
||||
queryset.suggested_correction
|
||||
if hasattr(queryset, "suggested_correction")
|
||||
else None
|
||||
)
|
||||
|
||||
return response
|
||||
except NotFound:
|
||||
raise
|
||||
except PermissionDenied as e:
|
||||
invalid_more_like_id_message = _("Invalid more_like_id")
|
||||
if str(e.detail) == str(invalid_more_like_id_message):
|
||||
return HttpResponseForbidden(invalid_more_like_id_message)
|
||||
return HttpResponseForbidden(_("Insufficient permissions."))
|
||||
except Exception as e:
|
||||
logger.warning(f"An error occurred listing search results: {e!s}")
|
||||
return HttpResponseBadRequest(
|
||||
"Error listing search results, check logs for more detail.",
|
||||
results = backend.more_like_this(
|
||||
more_like_doc_id,
|
||||
user=user,
|
||||
page=1,
|
||||
page_size=10000,
|
||||
)
|
||||
else:
|
||||
return super().list(request)
|
||||
|
||||
hits_by_id = {h["id"]: h for h in results.hits}
|
||||
|
||||
# Determine sort order: no ordering param → Tantivy relevance; otherwise → ORM order
|
||||
ordering_param = request.query_params.get("ordering", "").lstrip("-")
|
||||
if not ordering_param:
|
||||
# Preserve Tantivy relevance order; intersect with ORM-visible IDs
|
||||
orm_ids = set(filtered_qs.values_list("pk", flat=True))
|
||||
ordered_hits = [h for h in results.hits if h["id"] in orm_ids]
|
||||
else:
|
||||
# Use ORM ordering (already applied by DocumentsOrderingFilter)
|
||||
hit_ids = set(hits_by_id.keys())
|
||||
orm_ordered_ids = filtered_qs.filter(id__in=hit_ids).values_list(
|
||||
"pk",
|
||||
flat=True,
|
||||
)
|
||||
ordered_hits = [
|
||||
hits_by_id[pk] for pk in orm_ordered_ids if pk in hits_by_id
|
||||
]
|
||||
|
||||
rl = TantivyRelevanceList(ordered_hits)
|
||||
page = self.paginate_queryset(rl)
|
||||
|
||||
if page is not None:
|
||||
serializer = self.get_serializer(page, many=True)
|
||||
response = self.get_paginated_response(serializer.data)
|
||||
response.data["corrected_query"] = None
|
||||
return response
|
||||
|
||||
serializer = self.get_serializer(ordered_hits, many=True)
|
||||
return Response(serializer.data)
|
||||
|
||||
except NotFound:
|
||||
raise
|
||||
except PermissionDenied as e:
|
||||
invalid_more_like_id_message = _("Invalid more_like_id")
|
||||
if str(e.detail) == str(invalid_more_like_id_message):
|
||||
return HttpResponseForbidden(invalid_more_like_id_message)
|
||||
return HttpResponseForbidden(_("Insufficient permissions."))
|
||||
except Exception as e:
|
||||
logger.warning(f"An error occurred listing search results: {e!s}")
|
||||
return HttpResponseBadRequest(
|
||||
"Error listing search results, check logs for more detail.",
|
||||
)
|
||||
|
||||
@action(detail=False, methods=["GET"], name="Get Next ASN")
|
||||
def next_asn(self, request, *args, **kwargs):
|
||||
@@ -2816,18 +2830,9 @@ class SearchAutoCompleteView(GenericAPIView):
|
||||
else:
|
||||
limit = 10
|
||||
|
||||
from documents import index
|
||||
from documents.search import get_backend
|
||||
|
||||
ix = index.open_index()
|
||||
|
||||
return Response(
|
||||
index.autocomplete(
|
||||
ix,
|
||||
term,
|
||||
limit,
|
||||
user,
|
||||
),
|
||||
)
|
||||
return Response(get_backend().autocomplete(term, limit, user))
|
||||
|
||||
|
||||
@extend_schema_view(
|
||||
@@ -2893,20 +2898,21 @@ class GlobalSearchView(PassUserMixin):
|
||||
# First search by title
|
||||
docs = all_docs.filter(title__icontains=query)
|
||||
if not db_only and len(docs) < OBJECT_LIMIT:
|
||||
# If we don't have enough results, search by content
|
||||
from documents import index
|
||||
# If we don't have enough results, search by content.
|
||||
# Over-fetch from Tantivy (no permission filter) and rely on
|
||||
# the ORM all_docs queryset for authoritative permission gating.
|
||||
from documents.search import get_backend
|
||||
|
||||
with index.open_index_searcher() as s:
|
||||
fts_query = index.DelayedFullTextQuery(
|
||||
s,
|
||||
request.query_params,
|
||||
OBJECT_LIMIT,
|
||||
filter_queryset=all_docs,
|
||||
)
|
||||
results = fts_query[0:1]
|
||||
docs = docs | Document.objects.filter(
|
||||
id__in=[r["id"] for r in results],
|
||||
)
|
||||
fts_results = get_backend().search(
|
||||
query,
|
||||
user=None,
|
||||
page=1,
|
||||
page_size=1000,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
)
|
||||
fts_ids = {h["id"] for h in fts_results.hits}
|
||||
docs = docs | all_docs.filter(id__in=fts_ids)
|
||||
docs = docs[:OBJECT_LIMIT]
|
||||
saved_views = (
|
||||
get_objects_for_user_owner_aware(
|
||||
@@ -4105,10 +4111,16 @@ class SystemStatusView(PassUserMixin):
|
||||
|
||||
index_error = None
|
||||
try:
|
||||
ix = index.open_index()
|
||||
from documents.search import get_backend
|
||||
|
||||
get_backend() # triggers open/rebuild; raises on error
|
||||
index_status = "OK"
|
||||
index_last_modified = make_aware(
|
||||
datetime.fromtimestamp(ix.last_modified()),
|
||||
# Use the most-recently modified file in the index directory as a proxy
|
||||
# for last index write time (Tantivy has no single last_modified() call).
|
||||
index_dir = settings.INDEX_DIR
|
||||
mtimes = [p.stat().st_mtime for p in index_dir.iterdir() if p.is_file()]
|
||||
index_last_modified = (
|
||||
make_aware(datetime.fromtimestamp(max(mtimes))) if mtimes else None
|
||||
)
|
||||
except Exception as e:
|
||||
index_status = "ERROR"
|
||||
|
||||
@@ -21,6 +21,7 @@ from paperless.settings.custom import parse_hosting_settings
|
||||
from paperless.settings.custom import parse_ignore_dates
|
||||
from paperless.settings.custom import parse_redis_url
|
||||
from paperless.settings.parsers import get_bool_from_env
|
||||
from paperless.settings.parsers import get_choice_from_env
|
||||
from paperless.settings.parsers import get_float_from_env
|
||||
from paperless.settings.parsers import get_int_from_env
|
||||
from paperless.settings.parsers import get_list_from_env
|
||||
@@ -85,6 +86,11 @@ EMPTY_TRASH_DIR = (
|
||||
# threads.
|
||||
MEDIA_LOCK = MEDIA_ROOT / "media.lock"
|
||||
INDEX_DIR = DATA_DIR / "index"
|
||||
|
||||
ADVANCED_FUZZY_SEARCH_THRESHOLD: float | None = get_float_from_env(
|
||||
"PAPERLESS_ADVANCED_FUZZY_SEARCH_THRESHOLD",
|
||||
)
|
||||
|
||||
MODEL_FILE = get_path_from_env(
|
||||
"PAPERLESS_MODEL_FILE",
|
||||
DATA_DIR / "classification_model.pickle",
|
||||
@@ -1033,10 +1039,55 @@ def _get_nltk_language_setting(ocr_lang: str) -> str | None:
|
||||
return iso_code_to_nltk.get(ocr_lang)
|
||||
|
||||
|
||||
def _get_search_language_setting(ocr_lang: str) -> str | None:
|
||||
"""
|
||||
Determine the Tantivy stemmer language.
|
||||
|
||||
If PAPERLESS_SEARCH_LANGUAGE is explicitly set, it is validated against
|
||||
the languages supported by Tantivy's built-in stemmer and returned as-is.
|
||||
Otherwise the primary Tesseract language code from PAPERLESS_OCR_LANGUAGE
|
||||
is mapped to the corresponding ISO 639-1 code understood by Tantivy.
|
||||
Returns None when unset and the OCR language has no Tantivy stemmer.
|
||||
"""
|
||||
explicit = os.environ.get("PAPERLESS_SEARCH_LANGUAGE")
|
||||
if explicit is not None:
|
||||
# Lazy import avoids any app-loading order concerns; _tokenizer has no
|
||||
# Django dependencies so this is safe.
|
||||
from documents.search._tokenizer import SUPPORTED_LANGUAGES
|
||||
|
||||
return get_choice_from_env("PAPERLESS_SEARCH_LANGUAGE", SUPPORTED_LANGUAGES)
|
||||
|
||||
# Infer from the primary Tesseract language code (ISO 639-2/T → ISO 639-1)
|
||||
primary = ocr_lang.split("+", maxsplit=1)[0].lower()
|
||||
_ocr_to_search: dict[str, str] = {
|
||||
"ara": "ar",
|
||||
"dan": "da",
|
||||
"nld": "nl",
|
||||
"eng": "en",
|
||||
"fin": "fi",
|
||||
"fra": "fr",
|
||||
"deu": "de",
|
||||
"ell": "el",
|
||||
"hun": "hu",
|
||||
"ita": "it",
|
||||
"nor": "no",
|
||||
"por": "pt",
|
||||
"ron": "ro",
|
||||
"rus": "ru",
|
||||
"spa": "es",
|
||||
"swe": "sv",
|
||||
"tam": "ta",
|
||||
"tur": "tr",
|
||||
}
|
||||
return _ocr_to_search.get(primary)
|
||||
|
||||
|
||||
NLTK_ENABLED: Final[bool] = get_bool_from_env("PAPERLESS_ENABLE_NLTK", "yes")
|
||||
|
||||
NLTK_LANGUAGE: str | None = _get_nltk_language_setting(OCR_LANGUAGE)
|
||||
|
||||
SEARCH_LANGUAGE: str | None = _get_search_language_setting(OCR_LANGUAGE)
|
||||
|
||||
###############################################################################
|
||||
# Email Preprocessors #
|
||||
###############################################################################
|
||||
|
||||
@@ -35,7 +35,6 @@ from rest_framework.permissions import IsAuthenticated
|
||||
from rest_framework.response import Response
|
||||
from rest_framework.viewsets import ModelViewSet
|
||||
|
||||
from documents.index import DelayedQuery
|
||||
from documents.permissions import PaperlessObjectPermissions
|
||||
from documents.tasks import llmindex_index
|
||||
from paperless.filters import GroupFilterSet
|
||||
@@ -72,20 +71,12 @@ class StandardPagination(PageNumberPagination):
|
||||
)
|
||||
|
||||
def get_all_result_ids(self):
|
||||
from documents.search import TantivyRelevanceList
|
||||
|
||||
query = self.page.paginator.object_list
|
||||
if isinstance(query, DelayedQuery):
|
||||
try:
|
||||
ids = [
|
||||
query.searcher.ixreader.stored_fields(
|
||||
doc_num,
|
||||
)["id"]
|
||||
for doc_num in query.saved_results.get(0).results.docs()
|
||||
]
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
ids = self.page.paginator.object_list.values_list("pk", flat=True)
|
||||
return ids
|
||||
if isinstance(query, TantivyRelevanceList):
|
||||
return [h["id"] for h in query._hits]
|
||||
return self.page.paginator.object_list.values_list("pk", flat=True)
|
||||
|
||||
def get_paginated_response_schema(self, schema):
|
||||
response_schema = super().get_paginated_response_schema(schema)
|
||||
|
||||
113
uv.lock
generated
113
uv.lock
generated
@@ -350,15 +350,6 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/b3/73/3183c9e41ca755713bdf2cc1d0810df742c09484e2e1ddd693bee53877c1/brotli-1.2.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d2d085ded05278d1c7f65560aae97b3160aeb2ea2c0b3e26204856beccb60888", size = 1488164, upload-time = "2025-11-05T18:38:53.079Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cached-property"
|
||||
version = "2.0.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/76/4b/3d870836119dbe9a5e3c9a61af8cc1a8b69d75aea564572e385882d5aefb/cached_property-2.0.1.tar.gz", hash = "sha256:484d617105e3ee0e4f1f58725e72a8ef9e93deee462222dbd51cd91230897641", size = 10574, upload-time = "2024-10-25T15:43:55.667Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/11/0e/7d8225aab3bc1a0f5811f8e1b557aa034ac04bdf641925b30d3caf586b28/cached_property-2.0.1-py3-none-any.whl", hash = "sha256:f617d70ab1100b7bcf6e42228f9ddcb78c676ffa167278d9f730d1c2fba69ccb", size = 7428, upload-time = "2024-10-25T15:43:54.711Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cbor2"
|
||||
version = "5.9.0"
|
||||
@@ -2910,12 +2901,12 @@ dependencies = [
|
||||
{ name = "scikit-learn", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "sentence-transformers", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "setproctitle", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "tantivy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "tika-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "torch", version = "2.10.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" },
|
||||
{ name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'linux'" },
|
||||
{ name = "watchfiles", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "whitenoise", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "whoosh-reloaded", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "zxing-cpp", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
|
||||
@@ -2951,6 +2942,7 @@ dev = [
|
||||
{ name = "pytest-sugar", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "pytest-xdist", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "ruff", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "time-machine", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "zensical", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
docs = [
|
||||
@@ -2974,6 +2966,7 @@ testing = [
|
||||
{ name = "pytest-rerunfailures", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "pytest-sugar", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "pytest-xdist", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "time-machine", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
typing = [
|
||||
{ name = "celery-types", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
@@ -3064,11 +3057,11 @@ requires-dist = [
|
||||
{ name = "scikit-learn", specifier = "~=1.8.0" },
|
||||
{ name = "sentence-transformers", specifier = ">=4.1" },
|
||||
{ name = "setproctitle", specifier = "~=1.3.4" },
|
||||
{ name = "tantivy", specifier = ">=0.25.1" },
|
||||
{ name = "tika-client", specifier = "~=0.10.0" },
|
||||
{ name = "torch", specifier = "~=2.10.0", index = "https://download.pytorch.org/whl/cpu" },
|
||||
{ name = "watchfiles", specifier = ">=1.1.1" },
|
||||
{ name = "whitenoise", specifier = "~=6.11" },
|
||||
{ name = "whoosh-reloaded", specifier = ">=2.7.5" },
|
||||
{ name = "zxing-cpp", specifier = "~=3.0.0" },
|
||||
]
|
||||
provides-extras = ["mariadb", "postgres", "webserver"]
|
||||
@@ -3090,6 +3083,7 @@ dev = [
|
||||
{ name = "pytest-sugar" },
|
||||
{ name = "pytest-xdist", specifier = "~=3.8.0" },
|
||||
{ name = "ruff", specifier = "~=0.15.0" },
|
||||
{ name = "time-machine", specifier = ">=2.13" },
|
||||
{ name = "zensical", specifier = ">=0.0.21" },
|
||||
]
|
||||
docs = [{ name = "zensical", specifier = ">=0.0.21" }]
|
||||
@@ -3111,6 +3105,7 @@ testing = [
|
||||
{ name = "pytest-rerunfailures", specifier = "~=16.1" },
|
||||
{ name = "pytest-sugar" },
|
||||
{ name = "pytest-xdist", specifier = "~=3.8.0" },
|
||||
{ name = "time-machine", specifier = ">=2.13" },
|
||||
]
|
||||
typing = [
|
||||
{ name = "celery-types" },
|
||||
@@ -4664,6 +4659,34 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tantivy"
|
||||
version = "0.25.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/1b/f9/0cd3955d155d3e3ef74b864769514dd191e5dacba9f0beb7af2d914942ce/tantivy-0.25.1.tar.gz", hash = "sha256:68a3314699a7d18fcf338b52bae8ce46a97dde1128a3e47e33fa4db7f71f265e", size = 75120, upload-time = "2025-12-02T11:57:12.997Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/4e/7a/8a277f377e8a151fc0e71d4ffc1114aefb6e5e1c7dd609fed0955cf34ed8/tantivy-0.25.1-cp311-cp311-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:d363d7b4207d3a5aa7f0d212420df35bed18bdb6bae26a2a8bd57428388b7c29", size = 7637033, upload-time = "2025-12-02T11:56:18.104Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/71/31/8b4acdedfc9f9a2d04b1340d07eef5213d6f151d1e18da0cb423e5f090d2/tantivy-0.25.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:8f4389cf1d889a1df7c5a3195806b4b56c37cee10d8a26faaa0dea35a867b5ff", size = 3932180, upload-time = "2025-12-02T11:56:19.833Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2f/dc/3e8499c21b4b9795e8f2fc54c68ce5b92905aaeadadaa56ecfa9180b11b1/tantivy-0.25.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99864c09fc54652c3c2486cdf13f86cdc8200f4b481569cb291e095ca5d496e5", size = 4197620, upload-time = "2025-12-02T11:56:21.496Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f8/8e/f2ce62fffc811eb62bead92c7b23c2e218f817cbd54c4f3b802e03ba1438/tantivy-0.25.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05abf37ddbc5063c575548be0d62931629c086bff7a5a1b67cf5a8f5ebf4cd8c", size = 4183794, upload-time = "2025-12-02T11:56:23.215Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/41/e7/6849c713ed0996c7628324c60512c4882006f0a62145e56c624a93407f90/tantivy-0.25.1-cp312-cp312-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:90fd919e5f611809f746560ecf36eb9be824dec62e21ae17a27243759edb9aa1", size = 7621494, upload-time = "2025-12-02T11:56:27.069Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c5/22/c3d8294600dc6e7fa350daef9ff337d3c06e132b81df727de9f7a50c692a/tantivy-0.25.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:4613c7cf6c23f3a97989819690a0f956d799354957de7a204abcc60083cebe02", size = 3925219, upload-time = "2025-12-02T11:56:29.403Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/41/fc/cbb1df71dd44c9110eff4eaaeda9d44f2d06182fe0452193be20ddfba93f/tantivy-0.25.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c477bd20b4df804d57dfc5033431bef27cde605695ae141b03abbf6ebc069129", size = 4198699, upload-time = "2025-12-02T11:56:31.359Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/47/4d/71abb78b774073c3ce12a4faa4351a9d910a71ffa3659526affba163873d/tantivy-0.25.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9b1a1ba1113c523c7ff7b10f282d6c4074006f7ef8d71e1d973d51bf7291ddb", size = 4183585, upload-time = "2025-12-02T11:56:33.317Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3d/25/73cfbcf1a8ea49be6c42817431cac46b70a119fe64da903fcc2d92b5b511/tantivy-0.25.1-cp313-cp313-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:f51ff7196c6f31719202080ed8372d5e3d51e92c749c032fb8234f012e99744c", size = 7622530, upload-time = "2025-12-02T11:56:36.839Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/12/c8/c0d7591cdf4f7e7a9fc4da786d1ca8cd1aacffaa2be16ea6d401a8e4a566/tantivy-0.25.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:550e63321bfcacc003859f2fa29c1e8e56450807b3c9a501c1add27cfb9236d9", size = 3925637, upload-time = "2025-12-02T11:56:38.425Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3a/09/bedfc223bffec7641b417dd7ab071134b2ef8f8550e9b1fb6014657ef52e/tantivy-0.25.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fde31cc8d6e122faf7902aeea32bc008a429a6e8904e34d3468126a3ec01b016", size = 4197322, upload-time = "2025-12-02T11:56:40.411Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f5/f1/1fa5183500c8042200c9f2b840d34f5bbcfb434a1ee750e7132262d2a5c9/tantivy-0.25.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b11bd5a518b0be645320b47af8493f6a40c4f3234313e37adcf4534a564d27dd", size = 4183143, upload-time = "2025-12-02T11:56:42.048Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8b/2f/581519492226f97d23bd0adc95dad991ebeaa73ea6abc8bff389a3096d9a/tantivy-0.25.1-cp313-cp313t-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:dae99e75b7eaa9bf5bd16ab106b416370f08c135aed0e117d62a3201cd1ffe36", size = 7610316, upload-time = "2025-12-02T11:56:45.927Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/91/40/5d7bc315ab9e6a22c5572656e8ada1c836cfa96dccf533377504fbc3c9d9/tantivy-0.25.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:506e9533c5ef4d3df43bad64ffecc0aa97c76e361ea610815dc3a20a9d6b30b3", size = 3919882, upload-time = "2025-12-02T11:56:48.469Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/02/b9/e0ef2f57a6a72444cb66c2ffbc310ab33ffaace275f1c4b0319d84ea3f18/tantivy-0.25.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dbd4f8f264dacbcc9dee542832da2173fd53deaaea03f082d95214f8b5ed6bc", size = 4196031, upload-time = "2025-12-02T11:56:50.151Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1e/02/bf3f8cacfd08642e14a73f7956a3fb95d58119132c98c121b9065a1f8615/tantivy-0.25.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:824c643ccb640dd9e35e00c5d5054ddf3323f56fe4219d57d428a9eeea13d22c", size = 4183437, upload-time = "2025-12-02T11:56:51.818Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ff/44/9f1d67aa5030f7eebc966c863d1316a510a971dd8bb45651df4acdfae9ed/tantivy-0.25.1-cp314-cp314-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:7f5d29ae85dd0f23df8d15b3e7b341d4f9eb5a446bbb9640df48ac1f6d9e0c6c", size = 7623723, upload-time = "2025-12-02T11:56:55.066Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/db/30/6e085bd3ed9d12da3c91c185854abd70f9dfd35fb36a75ea98428d42c30b/tantivy-0.25.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:f2d2938fb69a74fc1bb36edfaf7f0d1596fa1264db0f377bda2195c58bcb6245", size = 3926243, upload-time = "2025-12-02T11:56:57.058Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/32/f5/a00d65433430f51718e5cc6938df571765d7c4e03aedec5aef4ab567aa9b/tantivy-0.25.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f5ff124c4802558e627091e780b362ca944169736caba5a372eef39a79d0ae0", size = 4207186, upload-time = "2025-12-02T11:56:58.803Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/19/63/61bdb12fc95f2a7f77bd419a5149bfa9f28caa76cb569bf2b6b06e1d033e/tantivy-0.25.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43b80ef62a340416139c93d19264e5f808da48e04f9305f1092b8ed22be0a5be", size = 4187312, upload-time = "2025-12-02T11:57:00.595Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tenacity"
|
||||
version = "9.1.2"
|
||||
@@ -4752,6 +4775,62 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706, upload-time = "2025-10-06T20:22:33.385Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time-machine"
|
||||
version = "3.2.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/02/fc/37b02f6094dbb1f851145330460532176ed2f1dc70511a35828166c41e52/time_machine-3.2.0.tar.gz", hash = "sha256:a4ddd1cea17b8950e462d1805a42b20c81eb9aafc8f66b392dd5ce997e037d79", size = 14804, upload-time = "2025-12-17T23:33:02.599Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f5/e1/03aae5fbaa53859f665094af696338fc7cae733d926a024af69982712350/time_machine-3.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c188a9dda9fcf975022f1b325b466651b96a4dfc223c523ed7ed8d979f9bf3e8", size = 19143, upload-time = "2025-12-17T23:31:44.258Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/75/8f/98cb17bebb52b22ff4ec26984dd44280f9c71353c3bae0640a470e6683e5/time_machine-3.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:17245f1cc2dd13f9d63a174be59bb2684a9e5e0a112ab707e37be92068cd655f", size = 15273, upload-time = "2025-12-17T23:31:45.246Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/dd/2f/ca11e4a7897234bb9331fcc5f4ed4714481ba4012370cc79a0ae8c42ea0a/time_machine-3.2.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d9bd1de1996e76efd36ae15970206c5089fb3728356794455bd5cd8d392b5537", size = 31049, upload-time = "2025-12-17T23:31:46.613Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/cf/ad/d17d83a59943094e6b6c6a3743caaf6811b12203c3e07a30cc7bcc2ab7ee/time_machine-3.2.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:98493cd50e8b7f941eab69b9e18e697ad69db1a0ec1959f78f3d7b0387107e5c", size = 32632, upload-time = "2025-12-17T23:31:47.72Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/71/50/d60576d047a0dfb5638cdfb335e9c3deb6e8528544fa0b3966a8480f72b7/time_machine-3.2.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:31f2a33d595d9f91eb9bc7f157f0dc5721f5789f4c4a9e8b852cdedb2a7d9b16", size = 34289, upload-time = "2025-12-17T23:31:48.913Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/fa/fe/4afa602dbdebddde6d0ea4a7fe849e49b9bb85dc3fb415725a87ccb4b471/time_machine-3.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9f78ac4213c10fbc44283edd1a29cfb7d3382484f4361783ddc057292aaa1889", size = 33175, upload-time = "2025-12-17T23:31:50.611Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0d/87/c152e23977c1d7d7c94eb3ed3ea45cc55971796205125c6fdff40db2c60f/time_machine-3.2.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:c1326b09e947b360926d529a96d1d9e126ce120359b63b506ecdc6ee20755c23", size = 31170, upload-time = "2025-12-17T23:31:51.645Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/80/af/54acf51d0f3ade3b51eab73df6192937c9a938753ef5456dff65eb8630be/time_machine-3.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9f2949f03d15264cc15c38918a2cda8966001f0f4ebe190cbfd9c56d91aed8ac", size = 32292, upload-time = "2025-12-17T23:31:52.803Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/71/8b/080c8eedcd67921a52ba5bd0e075362062509ab63c86fc1a0442fad241a6/time_machine-3.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:cc4bee5b0214d7dc4ebc91f4a4c600f1a598e9b5606ac751f42cb6f6740b1dbb", size = 19255, upload-time = "2025-12-17T23:31:58.057Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/66/17/0e5291e9eb705bf8a5a1305f826e979af307bbeb79def4ddbf4b3f9a81e0/time_machine-3.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3ca036304b4460ae2fdc1b52dd8b1fa7cf1464daa427fc49567413c09aa839c1", size = 15360, upload-time = "2025-12-17T23:31:59.048Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8b/e8/9ab87b71d2e2b62463b9b058b7ae7ac09fb57f8fcd88729dec169d304340/time_machine-3.2.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5442735b41d7a2abc2f04579b4ca6047ed4698a8338a4fec92c7c9423e7938cb", size = 33029, upload-time = "2025-12-17T23:32:00.413Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4b/26/b5ca19da6f25ea905b3e10a0ea95d697c1aeba0404803a43c68f1af253e6/time_machine-3.2.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:97da3e971e505cb637079fb07ab0bcd36e33279f8ecac888ff131f45ef1e4d8d", size = 34579, upload-time = "2025-12-17T23:32:01.431Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/79/ca/6ac7ad5f10ea18cc1d9de49716ba38c32132c7b64532430d92ef240c116b/time_machine-3.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3cdda6dee4966e38aeb487309bb414c6cb23a81fc500291c77a8fcd3098832e7", size = 35961, upload-time = "2025-12-17T23:32:02.521Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/33/67/390dd958bed395ab32d79a9fe61fe111825c0dd4ded54dbba7e867f171e6/time_machine-3.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:33d9efd302a6998bcc8baa4d84f259f8a4081105bd3d7f7af7f1d0abd3b1c8aa", size = 34668, upload-time = "2025-12-17T23:32:03.585Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/da/57/c88fff034a4e9538b3ae7c68c9cfb283670b14d17522c5a8bc17d29f9a4b/time_machine-3.2.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3a0b0a33971f14145853c9bd95a6ab0353cf7e0019fa2a7aa1ae9fddfe8eab50", size = 32891, upload-time = "2025-12-17T23:32:04.656Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2d/70/ebbb76022dba0fec8f9156540fc647e4beae1680c787c01b1b6200e56d70/time_machine-3.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2d0be9e5f22c38082d247a2cdcd8a936504e9db60b7b3606855fb39f299e9548", size = 34080, upload-time = "2025-12-17T23:32:06.146Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ee/cd/43ad5efc88298af3c59b66769cea7f055567a85071579ed40536188530c1/time_machine-3.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c421a8eb85a4418a7675a41bf8660224318c46cc62e4751c8f1ceca752059090", size = 19318, upload-time = "2025-12-17T23:32:10.518Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b0/f6/084010ef7f4a3f38b5a4900923d7c85b29e797655c4f6ee4ce54d903cca8/time_machine-3.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8f4e758f7727d0058c4950c66b58200c187072122d6f7a98b610530a4233ea7b", size = 15390, upload-time = "2025-12-17T23:32:11.625Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/25/aa/1cabb74134f492270dc6860cb7865859bf40ecf828be65972827646e91ad/time_machine-3.2.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:154bd3f75c81f70218b2585cc12b60762fb2665c507eec5ec5037d8756d9b4e0", size = 33115, upload-time = "2025-12-17T23:32:13.219Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5e/03/78c5d7dfa366924eb4dbfcc3fc917c39a4280ca234b12819cc1f16c03d88/time_machine-3.2.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d50cfe5ebea422c896ad8d278af9648412b7533b8ea6adeeee698a3fd9b1d3b7", size = 34705, upload-time = "2025-12-17T23:32:14.29Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/86/93/d5e877c24541f674c6869ff6e9c56833369796010190252e92c9d7ae5f0f/time_machine-3.2.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:636576501724bd6a9124e69d86e5aef263479e89ef739c5db361469f0463a0a1", size = 36104, upload-time = "2025-12-17T23:32:15.354Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/22/1c/d4bae72f388f67efc9609f89b012e434bb19d9549c7a7b47d6c7d9e5c55d/time_machine-3.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:40e6f40c57197fcf7ec32d2c563f4df0a82c42cdcc3cab27f688e98f6060df10", size = 34765, upload-time = "2025-12-17T23:32:16.434Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1d/c3/ac378cf301d527d8dfad2f0db6bad0dfb1ab73212eaa56d6b96ee5d9d20b/time_machine-3.2.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:a1bcf0b846bbfc19a79bc19e3fa04d8c7b1e8101c1b70340ffdb689cd801ea53", size = 33010, upload-time = "2025-12-17T23:32:17.532Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/06/35/7ce897319accda7a6970b288a9a8c52d25227342a7508505a2b3d235b649/time_machine-3.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ae55a56c179f4fe7a62575ad5148b6ed82f6c7e5cf2f9a9ec65f2f5b067db5f5", size = 34185, upload-time = "2025-12-17T23:32:18.566Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/67/e7/487f0ba5fe6c58186a5e1af2a118dfa2c160fedb37ef53a7e972d410408e/time_machine-3.2.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:59d71545e62525a4b85b6de9ab5c02ee3c61110fd7f636139914a2335dcbfc9c", size = 20000, upload-time = "2025-12-17T23:32:23.058Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e1/17/eb2c0054c8d44dd42df84ccd434539249a9c7d0b8eb53f799be2102500ab/time_machine-3.2.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:999672c621c35362bc28e03ca0c7df21500195540773c25993421fd8d6cc5003", size = 15657, upload-time = "2025-12-17T23:32:24.125Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/43/21/93443b5d1dd850f8bb9442e90d817a9033dcce6bfbdd3aabbb9786251c80/time_machine-3.2.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5faf7397f0580c7b9d67288522c8d7863e85f0cffadc0f1fccdb2c3dfce5783e", size = 39216, upload-time = "2025-12-17T23:32:25.542Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9f/9e/18544cf8acc72bb1dc03762231c82ecc259733f4bb6770a7bbe5cd138603/time_machine-3.2.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d3dd886ec49f1fa5a00e844f5947e5c0f98ce574750c24b7424c6f77fc1c3e87", size = 40764, upload-time = "2025-12-17T23:32:26.643Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/27/f7/9fe9ce2795636a3a7467307af6bdf38bb613ddb701a8a5cd50ec713beb5e/time_machine-3.2.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da0ecd96bc7bbe450acaaabe569d84e81688f1be8ad58d1470e42371d145fb53", size = 43526, upload-time = "2025-12-17T23:32:27.693Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/03/c1/a93e975ba9dec22e87ec92d18c28e67d36bd536f9119ffa439b2892b0c9c/time_machine-3.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:158220e946c1c4fb8265773a0282c88c35a7e3bb5d78e3561214e3b3231166f3", size = 41727, upload-time = "2025-12-17T23:32:28.985Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5f/fb/e3633e5a6bbed1c76bb2e9810dabc2f8467532ffcd29b9aed404b473061a/time_machine-3.2.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8c1aee29bc54356f248d5d7dfdd131e12ca825e850a08c0ebdb022266d073013", size = 38952, upload-time = "2025-12-17T23:32:30.031Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/82/3d/02e9fb2526b3d6b1b45bc8e4d912d95d1cd699d1a3f6df985817d37a0600/time_machine-3.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c8ed2224f09d25b1c2fc98683613aca12f90f682a427eabb68fc824d27014e4a", size = 39829, upload-time = "2025-12-17T23:32:31.075Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/61/70/b4b980d126ed155c78d1879c50d60c8dcbd47bd11cb14ee7be50e0dfc07f/time_machine-3.2.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:1398980c017fe5744d66f419e0115ee48a53b00b146d738e1416c225eb610b82", size = 19303, upload-time = "2025-12-17T23:32:35.796Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/73/73/eaa33603c69a68fe2b6f54f9dd75481693d62f1d29676531002be06e2d1c/time_machine-3.2.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:4f8f4e35f4191ef70c2ab8ff490761ee9051b891afce2bf86dde3918eb7b537b", size = 15431, upload-time = "2025-12-17T23:32:37.244Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/76/10/b81e138e86cc7bab40cdb59d294b341e172201f4a6c84bb0ec080407977a/time_machine-3.2.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:6db498686ecf6163c5aa8cf0bcd57bbe0f4081184f247edf3ee49a2612b584f9", size = 33206, upload-time = "2025-12-17T23:32:38.713Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d3/72/4deab446b579e8bd5dca91de98595c5d6bd6a17ce162abf5c5f2ce40d3d8/time_machine-3.2.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:027c1807efb74d0cd58ad16524dec94212fbe900115d70b0123399883657ac0f", size = 34792, upload-time = "2025-12-17T23:32:40.223Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2c/39/439c6b587ddee76d533fe972289d0646e0a5520e14dc83d0a30aeb5565f7/time_machine-3.2.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92432610c05676edd5e6946a073c6f0c926923123ce7caee1018dc10782c713d", size = 36187, upload-time = "2025-12-17T23:32:41.705Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4b/db/2da4368db15180989bab83746a857bde05ad16e78f326801c142bb747a06/time_machine-3.2.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c25586b62480eb77ef3d953fba273209478e1ef49654592cd6a52a68dfe56a67", size = 34855, upload-time = "2025-12-17T23:32:42.817Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/88/84/120a431fee50bc4c241425bee4d3a4910df4923b7ab5f7dff1bf0c772f08/time_machine-3.2.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:6bf3a2fa738d15e0b95d14469a0b8ea42635467408d8b490e263d5d45c9a177f", size = 33222, upload-time = "2025-12-17T23:32:43.94Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f9/ea/89cfda82bb8c57ff91bb9a26751aa234d6d90e9b4d5ab0ad9dce0f9f0329/time_machine-3.2.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ce76b82276d7ad2a66cdc85dad4df19d1422b69183170a34e8fbc4c3f35502f7", size = 34270, upload-time = "2025-12-17T23:32:45.037Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/86/a1/142de946dc4393f910bf4564b5c3ba819906e1f49b06c9cb557519c849e4/time_machine-3.2.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:4e374779021446fc2b5c29d80457ec9a3b1a5df043dc2aae07d7c1415d52323c", size = 19991, upload-time = "2025-12-17T23:32:49.933Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ee/62/7f17def6289901f94726921811a16b9adce46e666362c75d45730c60274f/time_machine-3.2.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:122310a6af9c36e9a636da32830e591e7923e8a07bdd0a43276c3a36c6821c90", size = 15707, upload-time = "2025-12-17T23:32:50.969Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5d/d3/3502fb9bd3acb159c18844b26c43220201a0d4a622c0c853785d07699a92/time_machine-3.2.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ba3eeb0f018cc362dd8128befa3426696a2e16dd223c3fb695fde184892d4d8c", size = 39207, upload-time = "2025-12-17T23:32:52.033Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5a/be/8b27f4aa296fda14a5a2ad7f588ddd450603c33415ab3f8e85b2f1a44678/time_machine-3.2.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:77d38ba664b381a7793f8786efc13b5004f0d5f672dae814430445b8202a67a6", size = 40764, upload-time = "2025-12-17T23:32:53.167Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/42/cd/fe4c4e5c8ab6d48fab3624c32be9116fb120173a35fe67e482e5cf68b3d2/time_machine-3.2.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f09abeb8f03f044d72712207e0489a62098ad3ad16dac38927fcf80baca4d6a7", size = 43508, upload-time = "2025-12-17T23:32:54.597Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b4/28/5a3ba2fce85b97655a425d6bb20a441550acd2b304c96b2c19d3839f721a/time_machine-3.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6b28367ce4f73987a55e230e1d30a57a3af85da8eb1a140074eb6e8c7e6ef19f", size = 41712, upload-time = "2025-12-17T23:32:55.781Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/81/58/e38084be7fdabb4835db68a3a47e58c34182d79fc35df1ecbe0db2c5359f/time_machine-3.2.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:903c7751c904581da9f7861c3015bed7cdc40047321291d3694a3cdc783bbca3", size = 38939, upload-time = "2025-12-17T23:32:56.867Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/40/d0/ad3feb0a392ef4e0c08bc32024950373ddc0669002cbdcbb9f3bf0c2d114/time_machine-3.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:528217cad85ede5f85c8bc78b0341868d3c3cfefc6ecb5b622e1cacb6c73247b", size = 39837, upload-time = "2025-12-17T23:32:58.283Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinytag"
|
||||
version = "2.2.1"
|
||||
@@ -5474,18 +5553,6 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/db/eb/d5583a11486211f3ebd4b385545ae787f32363d453c19fffd81106c9c138/whitenoise-6.12.0-py3-none-any.whl", hash = "sha256:fc5e8c572e33ebf24795b47b6a7da8da3c00cff2349f5b04c02f28d0cc5a3cc2", size = 20302, upload-time = "2026-02-27T00:05:40.086Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "whoosh-reloaded"
|
||||
version = "2.7.5"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "cached-property", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/17/51/3fb4b9fdeaaf96512514ccf2871186333ce41a0de2ea48236a4056a5f6af/Whoosh-Reloaded-2.7.5.tar.gz", hash = "sha256:39ed7dfbd1fec97af33933107bdf78110728375ed0f2abb25dec6dbfdcb279d8", size = 1061606, upload-time = "2024-02-02T20:06:42.285Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/69/90/866dfe421f188217ecd7339585e961034a7f4fdc96b62cec3b40a50dbdef/Whoosh_Reloaded-2.7.5-py2.py3-none-any.whl", hash = "sha256:2ab6aeeafb359fbff4beb3c704b960fd88240354f3363f1c5bdb5c2325cae80e", size = 551793, upload-time = "2024-02-02T20:06:39.868Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wrapt"
|
||||
version = "2.0.1"
|
||||
|
||||
Reference in New Issue
Block a user