Enhancement(beta): let name matching short-circuit on taxonomy hints

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
stumpylog
2026-06-13 06:10:44 -07:00
parent 7ed58f9664
commit 5202b0880e
2 changed files with 135 additions and 11 deletions
+38 -11
View File
@@ -15,40 +15,56 @@ MATCH_THRESHOLD = 0.8
logger = logging.getLogger("paperless_ai.matching")
def match_tags_by_name(names: list[str], user: User) -> list[Tag]:
def match_tags_by_name(
names: list[str],
user: User,
hinted_names: set[str] | None = None,
) -> list[Tag]:
queryset = get_objects_for_user_owner_aware(
user,
["view_tag"],
Tag,
)
return _match_names_to_queryset(names, queryset, "name")
return _match_names_to_queryset(names, queryset, "name", hinted_names)
def match_correspondents_by_name(names: list[str], user: User) -> list[Correspondent]:
def match_correspondents_by_name(
names: list[str],
user: User,
hinted_names: set[str] | None = None,
) -> list[Correspondent]:
queryset = get_objects_for_user_owner_aware(
user,
["view_correspondent"],
Correspondent,
)
return _match_names_to_queryset(names, queryset, "name")
return _match_names_to_queryset(names, queryset, "name", hinted_names)
def match_document_types_by_name(names: list[str], user: User) -> list[DocumentType]:
def match_document_types_by_name(
names: list[str],
user: User,
hinted_names: set[str] | None = None,
) -> list[DocumentType]:
queryset = get_objects_for_user_owner_aware(
user,
["view_documenttype"],
DocumentType,
)
return _match_names_to_queryset(names, queryset, "name")
return _match_names_to_queryset(names, queryset, "name", hinted_names)
def match_storage_paths_by_name(names: list[str], user: User) -> list[StoragePath]:
def match_storage_paths_by_name(
names: list[str],
user: User,
hinted_names: set[str] | None = None,
) -> list[StoragePath]:
queryset = get_objects_for_user_owner_aware(
user,
["view_storagepath"],
StoragePath,
)
return _match_names_to_queryset(names, queryset, "name")
return _match_names_to_queryset(names, queryset, "name", hinted_names)
def _normalize(s: str) -> str:
@@ -58,10 +74,18 @@ def _normalize(s: str) -> str:
return s
def _match_names_to_queryset(names: list[str], queryset, attr: str):
def _match_names_to_queryset(
names: list[str],
queryset,
attr: str,
hinted_names: set[str] | None = None,
):
results = []
objects = list(queryset)
object_names = [_normalize(getattr(obj, attr)) for obj in objects]
normalized_hints = (
{_normalize(name) for name in hinted_names} if hinted_names else set()
)
for name in names:
if not name:
@@ -76,6 +100,11 @@ def _match_names_to_queryset(names: list[str], queryset, attr: str):
results.append(matched)
continue
# A hinted name that didn't exact-match came from existing taxonomy
# verbatim; do not fuzzy-map it onto a different object.
if target in normalized_hints:
continue
# Fuzzy match fallback
matches = difflib.get_close_matches(
target,
@@ -88,8 +117,6 @@ def _match_names_to_queryset(names: list[str], queryset, attr: str):
matched = objects.pop(index)
object_names.pop(index)
results.append(matched)
else:
pass
return results
+97
View File
@@ -1,6 +1,8 @@
import difflib
from unittest.mock import patch
import pytest
import pytest_mock
from django.test import TestCase
from documents.models import Correspondent
@@ -87,6 +89,101 @@ class TestAIMatching(TestCase):
self.assertEqual(result[1].name, "Test Tag 2")
@pytest.mark.django_db
class TestHintedMatching:
def test_hinted_verbatim_skips_fuzzy(
self,
mocker: pytest_mock.MockerFixture,
) -> None:
Tag.objects.create(name="Bloodwork")
mocker.patch(
"paperless_ai.matching.get_objects_for_user_owner_aware",
return_value=Tag.objects.all(),
)
spy = mocker.spy(difflib, "get_close_matches")
result = match_tags_by_name(
["Bloodwork"],
user=None,
hinted_names={"Bloodwork"},
)
assert [t.name for t in result] == ["Bloodwork"]
spy.assert_not_called()
def test_unhinted_name_still_fuzzy_matches(
self,
mocker: pytest_mock.MockerFixture,
) -> None:
Tag.objects.create(name="Bloodwork")
mocker.patch(
"paperless_ai.matching.get_objects_for_user_owner_aware",
return_value=Tag.objects.all(),
)
# "Bloodwrok" is a typo not in hints -> fuzzy still maps it to Bloodwork.
result = match_tags_by_name(
["Bloodwrok"],
user=None,
hinted_names={"Taxes"},
)
assert [t.name for t in result] == ["Bloodwork"]
def test_hinted_name_with_whitespace_exact_matches(
self,
mocker: pytest_mock.MockerFixture,
) -> None:
Tag.objects.create(name="Bloodwork")
mocker.patch(
"paperless_ai.matching.get_objects_for_user_owner_aware",
return_value=Tag.objects.all(),
)
spy = mocker.spy(difflib, "get_close_matches")
result = match_tags_by_name(
["Bloodwork "],
user=None,
hinted_names={"Bloodwork"},
)
assert [t.name for t in result] == ["Bloodwork"]
spy.assert_not_called()
def test_hinted_name_absent_from_queryset_is_skipped_not_fuzzed(
self,
mocker: pytest_mock.MockerFixture,
) -> None:
# A hint with no exact object must not fall through to fuzzy.
Tag.objects.create(name="Bloodwork")
mocker.patch(
"paperless_ai.matching.get_objects_for_user_owner_aware",
return_value=Tag.objects.all(),
)
result = match_tags_by_name(
["Bloodwrok"],
user=None,
hinted_names={"Bloodwrok"},
)
assert result == []
def test_backward_compatible_without_kwarg(
self,
mocker: pytest_mock.MockerFixture,
) -> None:
Tag.objects.create(name="Test Tag 1")
mocker.patch(
"paperless_ai.matching.get_objects_for_user_owner_aware",
return_value=Tag.objects.all(),
)
result = match_tags_by_name(["Test Tag 1", "Nonexistent"], user=None)
assert [t.name for t in result] == ["Test Tag 1"]
@pytest.mark.django_db
class TestExtractUnmatchedNamesNormalization:
def test_punctuated_name_already_matched_is_not_returned_as_unmatched(