refactor(search): replace NLTK autocomplete extraction with regex \w+ + timeout

NLTK was inappropriate here: we want no stopword filtering (users should
be able to autocomplete any word) and no length floor, and the
unicode-aware \w+ split tokenizes consistently with Tantivy's simple
tokenizer. The regex library (already a project dependency) is used for
ReDoS protection via a per-call timeout.
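
Illustration (not part of the diff): the unicode-aware \w+ split keeps
the short and accented tokens the old paths filtered out, e.g.:

    >>> import regex
    >>> regex.findall(r"\w+", "L'état, c'est moi")
    ['L', 'état', 'c', 'est', 'moi']
    >>> import re
    >>> re.findall(r"\b[a-zA-Z]{3,}\b", "L'état, c'est moi")  # old fallback
    ['est', 'moi']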

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Trenton H
2026-03-30 08:38:22 -07:00
parent 0fb57205db
commit fcd4d28f37
+23 -81
@@ -13,6 +13,7 @@ from typing import TypedDict
 from typing import TypeVar
 
 import filelock
+import regex
 import tantivy
 from django.conf import settings
 from django.utils.timezone import get_current_timezone
@@ -37,6 +38,9 @@ if TYPE_CHECKING:
 logger = logging.getLogger("paperless.search")
 
+_WORD_RE = regex.compile(r"\w+")
+_AUTOCOMPLETE_REGEX_TIMEOUT = 1.0  # seconds; guards against ReDoS on untrusted content
+
 T = TypeVar("T")
@@ -51,89 +55,27 @@ def _ascii_fold(s: str) -> str:
 def _extract_autocomplete_words(text_sources: list[str]) -> set[str]:
-    """Extract and normalize words for autocomplete, filtering stopwords."""
+    """Extract and normalize words for autocomplete.
+
+    Splits on non-word characters (matching Tantivy's simple tokenizer), lowercases,
+    and ascii-folds each token. Uses the regex library with a timeout to guard against
+    ReDoS on untrusted document content.
+    """
     words = set()
-    # Use NLTK if enabled
-    if settings.NLTK_ENABLED and settings.NLTK_LANGUAGE:
+    for text in text_sources:
+        if not text:
+            continue
         try:
-            import nltk
-            from nltk.corpus import stopwords
-            from nltk.tokenize import word_tokenize
-
-            # Set NLTK data path
-            nltk.data.path = [settings.NLTK_DIR]
-
-            # Get stopwords for the configured language
-            try:
-                stopwords.ensure_loaded()
-                stop_words = frozenset(stopwords.words(settings.NLTK_LANGUAGE))
-            except (AttributeError, OSError) as e:
-                logger.debug(f"Could not load NLTK stopwords: {e}")
-                stop_words = frozenset()
-
-            for text in text_sources:
-                if text:
-                    try:
-                        tokens = word_tokenize(
-                            text.lower(),
-                            language=settings.NLTK_LANGUAGE,
-                        )
-                        for token in tokens:
-                            if (
-                                token.isalpha()
-                                and len(token) > 2
-                                and token not in stop_words
-                            ):
-                                normalized = _ascii_fold(token)
-                                if normalized:
-                                    words.add(normalized)
-                    except Exception as e:
-                        logger.debug(f"NLTK tokenization failed: {e}")
-                        # Fallback to regex
-                        import re
-
-                        tokens = re.findall(r"\b[a-zA-Z]{3,}\b", text)
-                        for token in tokens:
-                            normalized = _ascii_fold(token.lower())
-                            if normalized and normalized not in stop_words:
-                                words.add(normalized)
-        except ImportError:
-            logger.debug("NLTK not available, using fallback tokenization")
-            # Fall through to basic tokenization
-        except Exception as e:
-            logger.debug(f"NLTK initialization failed: {e}")
-            # Fall through to basic tokenization
-
-    # Fallback tokenization when NLTK is disabled or unavailable
-    if not words:  # Only use fallback if NLTK didn't produce results
-        import re
-
-        basic_stopwords = {
-            "the",
-            "a",
-            "an",
-            "and",
-            "or",
-            "but",
-            "in",
-            "on",
-            "at",
-            "to",
-            "for",
-            "of",
-            "with",
-            "by",
-        }
-        for text in text_sources:
-            if text:
-                tokens = re.findall(r"\b[a-zA-Z]{3,}\b", text)
-                for token in tokens:
-                    normalized = _ascii_fold(token.lower())
-                    if normalized and normalized not in basic_stopwords:
-                        words.add(normalized)
+            tokens = _WORD_RE.findall(text, timeout=_AUTOCOMPLETE_REGEX_TIMEOUT)
+        except regex.TimeoutError:
+            logger.warning(
+                "Autocomplete word extraction timed out for a text source; skipping.",
+            )
+            continue
+        for token in tokens:
+            normalized = _ascii_fold(token.lower())
+            if normalized:
+                words.add(normalized)
     return words
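
For reference, a hypothetical sketch of the new helper's behavior,
assuming _ascii_fold maps accented characters to their ASCII
equivalents (per its name):

    >>> _extract_autocomplete_words(["Déjà vu!", "", "crème brûlée"])
    {'deja', 'vu', 'creme', 'brulee'}

Empty sources are skipped, and the per-source timeout means one
pathological text source cannot stall the whole extraction.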