From 5202b0880ee0fa4b4d8cad134e6c71e0ca0fd205 Mon Sep 17 00:00:00 2001
From: stumpylog <797416+stumpylog@users.noreply.github.com>
Date: Sat, 13 Jun 2026 06:10:44 -0700
Subject: [PATCH] Enhancement(beta): let name matching short-circuit on taxonomy hints

Co-Authored-By: Claude Opus 4.8
---
 src/paperless_ai/matching.py            | 49 ++++++++++---
 src/paperless_ai/tests/test_matching.py | 97 +++++++++++++++++++++++++
 2 files changed, 135 insertions(+), 11 deletions(-)

diff --git a/src/paperless_ai/matching.py b/src/paperless_ai/matching.py
index c47c95001..df5438e14 100644
--- a/src/paperless_ai/matching.py
+++ b/src/paperless_ai/matching.py
@@ -15,40 +15,56 @@ MATCH_THRESHOLD = 0.8
 logger = logging.getLogger("paperless_ai.matching")
 
 
-def match_tags_by_name(names: list[str], user: User) -> list[Tag]:
+def match_tags_by_name(
+    names: list[str],
+    user: User,
+    hinted_names: set[str] | None = None,
+) -> list[Tag]:
     queryset = get_objects_for_user_owner_aware(
         user,
         ["view_tag"],
         Tag,
     )
-    return _match_names_to_queryset(names, queryset, "name")
+    return _match_names_to_queryset(names, queryset, "name", hinted_names)
 
 
-def match_correspondents_by_name(names: list[str], user: User) -> list[Correspondent]:
+def match_correspondents_by_name(
+    names: list[str],
+    user: User,
+    hinted_names: set[str] | None = None,
+) -> list[Correspondent]:
     queryset = get_objects_for_user_owner_aware(
         user,
         ["view_correspondent"],
         Correspondent,
     )
-    return _match_names_to_queryset(names, queryset, "name")
+    return _match_names_to_queryset(names, queryset, "name", hinted_names)
 
 
-def match_document_types_by_name(names: list[str], user: User) -> list[DocumentType]:
+def match_document_types_by_name(
+    names: list[str],
+    user: User,
+    hinted_names: set[str] | None = None,
+) -> list[DocumentType]:
     queryset = get_objects_for_user_owner_aware(
         user,
         ["view_documenttype"],
         DocumentType,
     )
-    return _match_names_to_queryset(names, queryset, "name")
+    return _match_names_to_queryset(names, queryset, "name", hinted_names)
 
 
-def match_storage_paths_by_name(names: list[str], user: User) -> list[StoragePath]:
+def match_storage_paths_by_name(
+    names: list[str],
+    user: User,
+    hinted_names: set[str] | None = None,
+) -> list[StoragePath]:
     queryset = get_objects_for_user_owner_aware(
         user,
         ["view_storagepath"],
         StoragePath,
     )
-    return _match_names_to_queryset(names, queryset, "name")
+    return _match_names_to_queryset(names, queryset, "name", hinted_names)
 
 
 def _normalize(s: str) -> str:
@@ -58,10 +74,18 @@ def _normalize(s: str) -> str:
     return s
 
 
-def _match_names_to_queryset(names: list[str], queryset, attr: str):
+def _match_names_to_queryset(
+    names: list[str],
+    queryset,
+    attr: str,
+    hinted_names: set[str] | None = None,
+):
     results = []
     objects = list(queryset)
     object_names = [_normalize(getattr(obj, attr)) for obj in objects]
+    normalized_hints = (
+        {_normalize(name) for name in hinted_names} if hinted_names else set()
+    )
 
     for name in names:
         if not name:
@@ -76,6 +100,11 @@ def _match_names_to_queryset(names: list[str], queryset, attr: str):
             results.append(matched)
             continue
 
+        # A hinted name that didn't exact-match came from existing taxonomy
+        # verbatim; do not fuzzy-map it onto a different object.
+        if target in normalized_hints:
+            continue
+
         # Fuzzy match fallback
         matches = difflib.get_close_matches(
             target,
@@ -88,8 +117,6 @@ def _match_names_to_queryset(names: list[str], queryset, attr: str):
             matched = objects.pop(index)
             object_names.pop(index)
             results.append(matched)
-        else:
-            pass
 
     return results
 
diff --git a/src/paperless_ai/tests/test_matching.py b/src/paperless_ai/tests/test_matching.py
index 5cf23f2b8..1bff6939f 100644
--- a/src/paperless_ai/tests/test_matching.py
+++ b/src/paperless_ai/tests/test_matching.py
@@ -1,6 +1,8 @@
+import difflib
 from unittest.mock import patch
 
 import pytest
+import pytest_mock
 from django.test import TestCase
 
 from documents.models import Correspondent
@@ -87,6 +89,101 @@ class TestAIMatching(TestCase):
         self.assertEqual(result[1].name, "Test Tag 2")
 
 
+@pytest.mark.django_db
+class TestHintedMatching:
+    def test_hinted_verbatim_skips_fuzzy(
+        self,
+        mocker: pytest_mock.MockerFixture,
+    ) -> None:
+        Tag.objects.create(name="Bloodwork")
+        mocker.patch(
+            "paperless_ai.matching.get_objects_for_user_owner_aware",
+            return_value=Tag.objects.all(),
+        )
+        spy = mocker.spy(difflib, "get_close_matches")
+
+        result = match_tags_by_name(
+            ["Bloodwork"],
+            user=None,
+            hinted_names={"Bloodwork"},
+        )
+
+        assert [t.name for t in result] == ["Bloodwork"]
+        spy.assert_not_called()
+
+    def test_unhinted_name_still_fuzzy_matches(
+        self,
+        mocker: pytest_mock.MockerFixture,
+    ) -> None:
+        Tag.objects.create(name="Bloodwork")
+        mocker.patch(
+            "paperless_ai.matching.get_objects_for_user_owner_aware",
+            return_value=Tag.objects.all(),
+        )
+
+        # "Bloodwrok" is a typo not in hints -> fuzzy still maps it to Bloodwork.
+        result = match_tags_by_name(
+            ["Bloodwrok"],
+            user=None,
+            hinted_names={"Taxes"},
+        )
+
+        assert [t.name for t in result] == ["Bloodwork"]
+
+    def test_hinted_name_with_whitespace_exact_matches(
+        self,
+        mocker: pytest_mock.MockerFixture,
+    ) -> None:
+        Tag.objects.create(name="Bloodwork")
+        mocker.patch(
+            "paperless_ai.matching.get_objects_for_user_owner_aware",
+            return_value=Tag.objects.all(),
+        )
+        spy = mocker.spy(difflib, "get_close_matches")
+
+        result = match_tags_by_name(
+            ["Bloodwork "],
+            user=None,
+            hinted_names={"Bloodwork"},
+        )
+
+        assert [t.name for t in result] == ["Bloodwork"]
+        spy.assert_not_called()
+
+    def test_hinted_name_absent_from_queryset_is_skipped_not_fuzzed(
+        self,
+        mocker: pytest_mock.MockerFixture,
+    ) -> None:
+        # A hint with no exact object must not fall through to fuzzy.
+        Tag.objects.create(name="Bloodwork")
+        mocker.patch(
+            "paperless_ai.matching.get_objects_for_user_owner_aware",
+            return_value=Tag.objects.all(),
+        )
+
+        result = match_tags_by_name(
+            ["Bloodwrok"],
+            user=None,
+            hinted_names={"Bloodwrok"},
+        )
+
+        assert result == []
+
+    def test_backward_compatible_without_kwarg(
+        self,
+        mocker: pytest_mock.MockerFixture,
+    ) -> None:
+        Tag.objects.create(name="Test Tag 1")
+        mocker.patch(
+            "paperless_ai.matching.get_objects_for_user_owner_aware",
+            return_value=Tag.objects.all(),
+        )
+
+        result = match_tags_by_name(["Test Tag 1", "Nonexistent"], user=None)
+
+        assert [t.name for t in result] == ["Test Tag 1"]
+
+
 @pytest.mark.django_db
 class TestExtractUnmatchedNamesNormalization:
     def test_punctuated_name_already_matched_is_not_returned_as_unmatched(