mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-02 22:28:51 +00:00
Add simple text search mode and API param
This commit is contained in:
@@ -7,6 +7,7 @@ from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC
|
||||
from datetime import datetime
|
||||
from enum import StrEnum
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Self
|
||||
from typing import TypedDict
|
||||
@@ -20,6 +21,7 @@ from django.utils.timezone import get_current_timezone
|
||||
from guardian.shortcuts import get_users_with_perms
|
||||
|
||||
from documents.search._query import build_permission_filter
|
||||
from documents.search._query import parse_simple_text_query
|
||||
from documents.search._query import parse_user_query
|
||||
from documents.search._schema import _write_sentinels
|
||||
from documents.search._schema import build_schema
|
||||
@@ -45,6 +47,11 @@ _AUTOCOMPLETE_REGEX_TIMEOUT = 1.0 # seconds; guards against ReDoS on untrusted
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
class SearchMode(StrEnum):
|
||||
QUERY = "query"
|
||||
TEXT = "text"
|
||||
|
||||
|
||||
def _ascii_fold(s: str) -> str:
|
||||
"""
|
||||
Normalize unicode to ASCII equivalent characters for search consistency.
|
||||
@@ -433,6 +440,7 @@ class TantivyBackend:
|
||||
sort_field: str | None,
|
||||
*,
|
||||
sort_reverse: bool,
|
||||
search_mode: SearchMode = SearchMode.QUERY,
|
||||
) -> SearchResults:
|
||||
"""
|
||||
Execute a search query against the document index.
|
||||
@@ -441,20 +449,28 @@ class TantivyBackend:
|
||||
permission filtering before executing against Tantivy. Supports both
|
||||
relevance-based and field-based sorting.
|
||||
|
||||
QUERY search mode supports natural date keywords, field filters, etc.
|
||||
TEXT search mode treats the query as plain text to search for in title and content
|
||||
|
||||
Args:
|
||||
query: User's search query (supports natural date keywords, field filters)
|
||||
query: User's search query
|
||||
user: User for permission filtering (None for superuser/no filtering)
|
||||
page: Page number (1-indexed) for pagination
|
||||
page_size: Number of results per page
|
||||
sort_field: Field to sort by (None for relevance ranking)
|
||||
sort_reverse: Whether to reverse the sort order
|
||||
search_mode: "query" for advanced Tantivy syntax, "text" for
|
||||
plain-text search over title and content only
|
||||
|
||||
Returns:
|
||||
SearchResults with hits, total count, and processed query
|
||||
"""
|
||||
self._ensure_open()
|
||||
tz = get_current_timezone()
|
||||
user_query = parse_user_query(self._index, query, tz)
|
||||
if search_mode is SearchMode.TEXT:
|
||||
user_query = parse_simple_text_query(self._index, query)
|
||||
else:
|
||||
user_query = parse_user_query(self._index, query, tz)
|
||||
|
||||
# Apply permission filter if user is not None (not superuser)
|
||||
if user is not None:
|
||||
|
||||
@@ -51,6 +51,7 @@ _WHOOSH_REL_RANGE_RE = regex.compile(
|
||||
)
|
||||
# Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly
|
||||
_DATE8_RE = regex.compile(r"(?P<field>\w+):(?P<date8>\d{8})\b")
|
||||
_SIMPLE_QUERY_SPECIAL_CHARS_RE = regex.compile(r'([+\-!(){}\[\]^"~*?:\\/])')
|
||||
|
||||
|
||||
def _fmt(dt: datetime) -> str:
|
||||
@@ -436,6 +437,7 @@ DEFAULT_SEARCH_FIELDS = [
|
||||
"document_type",
|
||||
"tag",
|
||||
]
|
||||
SIMPLE_SEARCH_FIELDS = ["title", "content"]
|
||||
_FIELD_BOOSTS = {"title": 2.0}
|
||||
|
||||
|
||||
@@ -495,3 +497,28 @@ def parse_user_query(
|
||||
)
|
||||
|
||||
return exact
|
||||
|
||||
|
||||
def parse_simple_text_query(
    index: tantivy.Index,
    raw_query: str,
) -> tantivy.Query:
    """
    Parse a plain-text query using Tantivy's default parser over title/content.

    The raw string is treated as literal search text rather than query syntax:
    every character matched by ``_SIMPLE_QUERY_SPECIAL_CHARS_RE`` is
    backslash-escaped (kept, not removed) and whitespace is normalized before
    the string is handed to ``index.parse_query``.

    Args:
        index: Tantivy index whose parser builds the query.
        raw_query: User-supplied search text.

    Returns:
        The query produced by ``index.parse_query`` over
        ``SIMPLE_SEARCH_FIELDS`` with ``_FIELD_BOOSTS`` applied.
    """
    # Escape (do not strip) characters the parser would otherwise read as
    # syntax by prefixing each one with a backslash.
    escaped = _SIMPLE_QUERY_SPECIAL_CHARS_RE.sub(
        r"\\\1",
        raw_query,
        timeout=_REGEX_TIMEOUT,
    )
    # Collapse every whitespace run (spaces, tabs, newlines) to one space and
    # trim both ends. str.split() is a single linear pass, so no regex or
    # timeout is needed for this step.
    query_str = " ".join(escaped.split())
    return index.parse_query(
        query_str,
        SIMPLE_SEARCH_FIELDS,
        field_boosts=_FIELD_BOOSTS,
    )
|
||||
|
||||
@@ -1995,6 +1995,12 @@ class ChatStreamingView(GenericAPIView):
|
||||
list=extend_schema(
|
||||
description="Document views including search",
|
||||
parameters=[
|
||||
OpenApiParameter(
|
||||
name="text",
|
||||
type=OpenApiTypes.STR,
|
||||
location=OpenApiParameter.QUERY,
|
||||
description="Simple text search query string",
|
||||
),
|
||||
OpenApiParameter(
|
||||
name="query",
|
||||
type=OpenApiTypes.STR,
|
||||
@@ -2033,7 +2039,8 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
|
||||
def _is_search_request(self):
    """
    Report whether the current request should be handled as a search.

    Returns:
        True when the request's query parameters contain any of ``text``,
        ``query`` or ``more_like_id``; False otherwise.
    """
    params = self.request.query_params
    return any(key in params for key in ("text", "query", "more_like_id"))
|
||||
|
||||
@@ -2043,6 +2050,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
|
||||
from documents.search import TantivyRelevanceList
|
||||
from documents.search import get_backend
|
||||
from documents.search._backend import SearchMode
|
||||
|
||||
try:
|
||||
backend = get_backend()
|
||||
@@ -2051,8 +2059,15 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
|
||||
user = None if request.user.is_superuser else request.user
|
||||
|
||||
if "query" in request.query_params:
|
||||
query_str = request.query_params["query"]
|
||||
if "text" in request.query_params or "query" in request.query_params:
|
||||
search_mode = (
|
||||
SearchMode.TEXT
|
||||
if "text" in request.query_params
|
||||
else SearchMode.QUERY
|
||||
)
|
||||
query_str = (
|
||||
request.query_params.get("text") or request.query_params["query"]
|
||||
)
|
||||
results = backend.search(
|
||||
query_str,
|
||||
user=user,
|
||||
@@ -2060,6 +2075,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
page_size=10000,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=search_mode,
|
||||
)
|
||||
else:
|
||||
# more_like_id — validate permission on the seed document first
|
||||
|
||||
Reference in New Issue
Block a user