From 0078ef9cd5bf146582cb0aa26bda5fb3f1c51497 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Sun, 29 Mar 2026 15:26:07 -0700
Subject: [PATCH] refactor(search): add docstrings and complete type
 annotations to all search module functions

- Add descriptive docstrings to all functions in _schema.py, _tokenizer.py, and _query.py
- Complete type annotations for all function parameters and return values
- Fix 8 mypy strict errors in _query.py:
  - Add re.Match[str] type parameters for regex matches
  - Fix "Returning Any" error with str() cast
  - Add type annotations for build_permission_filter() and parse_user_query()
  - Remove lazy imports, move to module top level
- All 29 search module tests continue to pass

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/documents/search/_query.py     | 59 +++++++++++++++++++-----------
 src/documents/search/_schema.py    |  4 ++
 src/documents/search/_tokenizer.py |  6 +--
 3 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py
index b708f1d02..4176410a1 100644
--- a/src/documents/search/_query.py
+++ b/src/documents/search/_query.py
@@ -7,9 +7,14 @@ from datetime import datetime
 from datetime import timedelta
 from typing import TYPE_CHECKING
 
+import tantivy
+from django.conf import settings
+
 if TYPE_CHECKING:
     from datetime import tzinfo
 
+    from django.contrib.auth.base_user import AbstractBaseUser
+
 _DATE_ONLY_FIELDS = frozenset({"created"})
 
 _DATE_KEYWORDS = frozenset(
@@ -36,10 +41,12 @@ _RELATIVE_RANGE_RE = re.compile(
 
 
 def _fmt(dt: datetime) -> str:
+    """Format a datetime as an ISO 8601 UTC string for use in Tantivy range queries."""
     return dt.astimezone(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
 
 
 def _iso_range(lo: datetime, hi: datetime) -> str:
+    """Format a [lo TO hi] range string in ISO 8601 for Tantivy query syntax."""
     return f"[{_fmt(lo)} TO {_fmt(hi)}]"
 
 
@@ -144,7 +151,9 @@ def _datetime_range(keyword: str, tz: tzinfo) -> str:
 
 
 def _rewrite_compact_date(query: str) -> str:
-    def _sub(m: re.Match) -> str:
+    """Rewrite Whoosh compact date tokens (14-digit YYYYMMDDHHmmss) to ISO 8601."""
+
+    def _sub(m: re.Match[str]) -> str:
         raw = m.group(1)
         try:
             dt = datetime(
@@ -158,13 +167,15 @@ def _rewrite_compact_date(query: str) -> str:
             )
             return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
         except ValueError:
-            return m.group(0)
+            return str(m.group(0))
 
     return _COMPACT_DATE_RE.sub(_sub, query)
 
 
 def _rewrite_relative_range(query: str) -> str:
-    def _sub(m: re.Match) -> str:
+    """Rewrite Whoosh relative ranges ([now-7d TO now]) to concrete ISO 8601 UTC boundaries."""
+
+    def _sub(m: re.Match[str]) -> str:
         now = datetime.now(UTC)
 
         def _offset(s: str | None) -> timedelta:
@@ -198,7 +209,7 @@ def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
     query = _rewrite_compact_date(query)
     query = _rewrite_relative_range(query)
 
-    def _replace(m: re.Match) -> str:
+    def _replace(m: re.Match[str]) -> str:
         field, keyword = m.group(1), m.group(2)
         if field in _DATE_ONLY_FIELDS:
             return f"{field}:{_date_only_range(keyword, tz)}"
@@ -216,7 +227,7 @@ def normalize_query(query: str) -> str:
     tag:foo,bar → tag:foo AND tag:bar
     """
 
-    def _expand(m: re.Match) -> str:
+    def _expand(m: re.Match[str]) -> str:
         field = m.group(1)
         values = [v.strip() for v in m.group(2).split(",") if v.strip()]
         return " AND ".join(f"{field}:{v}" for v in values)
@@ -230,7 +241,10 @@ def normalize_query(query: str) -> str:
 _MAX_U64 = 2**64 - 1  # u64 max — used as inclusive upper bound for "any owner" range
 
 
-def build_permission_filter(schema, user):
+def build_permission_filter(
+    schema: tantivy.Schema,
+    user: AbstractBaseUser,
+) -> tantivy.Query:
     """
     Returns a Query matching documents visible to user:
     - no owner (public)      → owner_id field absent (NULL in Django)
@@ -251,36 +265,34 @@ def build_permission_filter(schema, user):
     simplify this to MustNot(exists_query("owner_id")) once released.
     See: https://github.com/quickwit-oss/tantivy-py/blob/master/tantivy/tantivy.pyi
     """
-    import tantivy as _tantivy
-
-    owner_any = _tantivy.Query.range_query(
+    owner_any = tantivy.Query.range_query(
         schema,
         "owner_id",
-        _tantivy.FieldType.Unsigned,
+        tantivy.FieldType.Unsigned,
         1,
         _MAX_U64,
     )
-    no_owner = _tantivy.Query.boolean_query(
+    no_owner = tantivy.Query.boolean_query(
         [
-            (_tantivy.Occur.Must, _tantivy.Query.all_query()),
-            (_tantivy.Occur.MustNot, owner_any),
+            (tantivy.Occur.Must, tantivy.Query.all_query()),
+            (tantivy.Occur.MustNot, owner_any),
         ],
     )
-    owned = _tantivy.Query.range_query(
+    owned = tantivy.Query.range_query(
         schema,
         "owner_id",
-        _tantivy.FieldType.Unsigned,
+        tantivy.FieldType.Unsigned,
         user.pk,
         user.pk,
     )
-    shared = _tantivy.Query.range_query(
+    shared = tantivy.Query.range_query(
         schema,
         "viewer_id",
-        _tantivy.FieldType.Unsigned,
+        tantivy.FieldType.Unsigned,
         user.pk,
         user.pk,
     )
-    return _tantivy.Query.disjunction_max_query([no_owner, owned, shared])
+    return tantivy.Query.disjunction_max_query([no_owner, owned, shared])
 
 
 # ── parse_user_query (full pipeline) ─────────────────────────────────────────
@@ -297,8 +309,13 @@ DEFAULT_SEARCH_FIELDS = [
 _FIELD_BOOSTS = {"title": 2.0}
 
 
-def parse_user_query(index, schema, raw_query: str, tz: tzinfo):
-    from django.conf import settings
+def parse_user_query(
+    index: tantivy.Index,
+    schema: tantivy.Schema,
+    raw_query: str,
+    tz: tzinfo,
+) -> tantivy.Query:
+    """Run the full query preprocessing pipeline: date rewriting → normalisation → Tantivy parse. Adds fuzzy blend if ADVANCED_FUZZY_SEARCH_THRESHOLD is set."""
 
     query_str = rewrite_natural_date_keywords(raw_query, tz)
     query_str = normalize_query(query_str)
@@ -311,8 +328,6 @@ def parse_user_query(index, schema, raw_query: str, tz: tzinfo):
 
     threshold = getattr(settings, "ADVANCED_FUZZY_SEARCH_THRESHOLD", None)
     if threshold is not None:
-        import tantivy
-
         fuzzy = index.parse_query(
             query_str,
             DEFAULT_SEARCH_FIELDS,
diff --git a/src/documents/search/_schema.py b/src/documents/search/_schema.py
index 3a68baa7b..5724d97a0 100644
--- a/src/documents/search/_schema.py
+++ b/src/documents/search/_schema.py
@@ -16,6 +16,7 @@ SCHEMA_VERSION = 1
 
 
 def build_schema() -> tantivy.Schema:
+    """Build the Tantivy schema for the paperless document index."""
     sb = tantivy.SchemaBuilder()
 
     sb.add_unsigned_field("id", stored=True, indexed=True, fast=True)
@@ -70,6 +71,7 @@ def build_schema() -> tantivy.Schema:
 
 
 def _needs_rebuild(index_dir: Path) -> bool:
+    """Check if the search index needs rebuilding by comparing schema version and language sentinel files."""
     version_file = index_dir / ".schema_version"
     if not version_file.exists():
         return True
@@ -92,6 +94,7 @@ def _needs_rebuild(index_dir: Path) -> bool:
 
 
 def _wipe_index(index_dir: Path) -> None:
+    """Delete all children in the index directory to prepare for rebuild."""
     for child in list(index_dir.iterdir()):
         if child.is_dir():
             shutil.rmtree(child)
@@ -100,6 +103,7 @@ def _wipe_index(index_dir: Path) -> None:
 
 
 def _write_sentinels(index_dir: Path) -> None:
+    """Write schema version and language sentinel files so the next index open can skip rebuilding."""
     (index_dir / ".schema_version").write_text(str(SCHEMA_VERSION))
     (index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE)
 
diff --git a/src/documents/search/_tokenizer.py b/src/documents/search/_tokenizer.py
index 5f656f585..ed30d5c37 100644
--- a/src/documents/search/_tokenizer.py
+++ b/src/documents/search/_tokenizer.py
@@ -67,7 +67,7 @@ def register_tokenizers(index: tantivy.Index, language: str) -> None:
 
 
 def _paperless_text(language: str) -> tantivy.TextAnalyzer:
-    """simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
+    """Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
     builder = (
         tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
         .filter(tantivy.Filter.remove_long(65))
@@ -88,7 +88,7 @@ def _paperless_text(language: str) -> tantivy.TextAnalyzer:
 
 
 def _simple_analyzer() -> tantivy.TextAnalyzer:
-    """simple -> lowercase -> ascii_fold. Used for shadow sort fields."""
+    """Tokenizer for shadow sort fields (title_sort, correspondent_sort, type_sort): simple -> lowercase -> ascii_fold."""
     return (
         tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
         .filter(tantivy.Filter.lowercase())
@@ -98,7 +98,7 @@ def _simple_analyzer() -> tantivy.TextAnalyzer:
 
 
 def _bigram_analyzer() -> tantivy.TextAnalyzer:
-    """ngram(2,2) -> lowercase. CJK / no-whitespace language support."""
+    """Enables substring search in CJK text: ngram(2,2) -> lowercase. CJK / no-whitespace language support."""
     return (
         tantivy.TextAnalyzerBuilder(
             tantivy.Tokenizer.ngram(min_gram=2, max_gram=2, prefix_only=False),