diff --git a/src/documents/search/_schema.py b/src/documents/search/_schema.py index 36ac83356..a82938a39 100644 --- a/src/documents/search/_schema.py +++ b/src/documents/search/_schema.py @@ -81,8 +81,10 @@ def build_schema() -> tantivy.Schema: tokenizer_name="simple_search_analyzer", ) - # Autocomplete prefix scan - stored, not indexed - sb.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw") + # Autocomplete prefix scan via terms_with_prefix, which walks the field's + # term dictionary - so the field must be indexed (term dict), not stored. + # The stored value is never read back, so storing it only wastes space. + sb.add_text_field("autocomplete_word", stored=False, tokenizer_name="raw") sb.add_text_field("tag", stored=True, tokenizer_name="paperless_text") diff --git a/src/documents/tests/test_api_search.py b/src/documents/tests/test_api_search.py index 830d00113..ad82e1a8a 100644 --- a/src/documents/tests/test_api_search.py +++ b/src/documents/tests/test_api_search.py @@ -987,29 +987,32 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): THEN: - The similar documents are returned from the API request """ - d1 = Document.objects.create( + # Distinct created/added dates: documents created at the same instant + # share a timestamp term, and more_like_this (which cannot be scoped to + # content fields) would then match on it, surfacing unrelated documents. + d1 = DocumentFactory( title="invoice", content="the thing i bought at a shop and paid with bank account", - checksum="A", - pk=1, + created=datetime.date(2018, 1, 1), + added=timezone.make_aware(datetime.datetime(2018, 1, 1)), ) - d2 = Document.objects.create( + d2 = DocumentFactory( title="bank statement 1", content="things i paid for in august", - pk=2, - checksum="B", + created=datetime.date(2019, 3, 4), + added=timezone.make_aware(datetime.datetime(2019, 3, 4)), ) - d3 = Document.objects.create( + d3 = DocumentFactory( title="bank statement 3", content="things i paid for in september", - pk=3, - checksum="C", + created=datetime.date(2020, 7, 9), + added=timezone.make_aware(datetime.datetime(2020, 7, 9)), ) - d4 = Document.objects.create( + d4 = DocumentFactory( title="Quarterly Report", content="quarterly revenue profit margin earnings growth", - pk=4, - checksum="ABC", + created=datetime.date(2021, 11, 30), + added=timezone.make_aware(datetime.datetime(2021, 11, 30)), ) backend = get_backend() backend.add_or_update(d1)