mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-28 16:24:19 +00:00
Enhancement(beta): let name matching short-circuit on taxonomy hints
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -15,40 +15,56 @@ MATCH_THRESHOLD = 0.8
|
||||
logger = logging.getLogger("paperless_ai.matching")
|
||||
|
||||
|
||||
def match_tags_by_name(
    names: list[str],
    user: User,
    hinted_names: set[str] | None = None,
) -> list[Tag]:
    """Resolve suggested tag names to ``Tag`` objects fetched for ``user``.

    Args:
        names: Candidate tag names to resolve.
        user: Passed to ``get_objects_for_user_owner_aware`` together with
            the ``view_tag`` permission to build the candidate queryset.
        hinted_names: Optional set of names forwarded unchanged to
            ``_match_names_to_queryset``. A hinted name that has no exact
            match is skipped there instead of being fuzzy-matched onto a
            different object. Defaults to ``None`` (no hints).

    Returns:
        Whatever ``_match_names_to_queryset`` returns for the ``name``
        attribute of the candidate tags.
    """
    queryset = get_objects_for_user_owner_aware(
        user,
        ["view_tag"],
        Tag,
    )
    # Always forward the hints; the helper treats None as "no hints".
    return _match_names_to_queryset(names, queryset, "name", hinted_names)
|
||||
|
||||
|
||||
def match_correspondents_by_name(
    names: list[str],
    user: User,
    hinted_names: set[str] | None = None,
) -> list[Correspondent]:
    """Resolve suggested correspondent names to ``Correspondent`` objects.

    Args:
        names: Candidate correspondent names to resolve.
        user: Passed to ``get_objects_for_user_owner_aware`` together with
            the ``view_correspondent`` permission to build the candidate
            queryset.
        hinted_names: Optional set of names forwarded unchanged to
            ``_match_names_to_queryset``. A hinted name that has no exact
            match is skipped there instead of being fuzzy-matched onto a
            different object. Defaults to ``None`` (no hints).

    Returns:
        Whatever ``_match_names_to_queryset`` returns for the ``name``
        attribute of the candidate correspondents.
    """
    queryset = get_objects_for_user_owner_aware(
        user,
        ["view_correspondent"],
        Correspondent,
    )
    # Always forward the hints; the helper treats None as "no hints".
    return _match_names_to_queryset(names, queryset, "name", hinted_names)
|
||||
|
||||
|
||||
def match_document_types_by_name(
    names: list[str],
    user: User,
    hinted_names: set[str] | None = None,
) -> list[DocumentType]:
    """Resolve suggested document type names to ``DocumentType`` objects.

    Args:
        names: Candidate document type names to resolve.
        user: Passed to ``get_objects_for_user_owner_aware`` together with
            the ``view_documenttype`` permission to build the candidate
            queryset.
        hinted_names: Optional set of names forwarded unchanged to
            ``_match_names_to_queryset``. A hinted name that has no exact
            match is skipped there instead of being fuzzy-matched onto a
            different object. Defaults to ``None`` (no hints).

    Returns:
        Whatever ``_match_names_to_queryset`` returns for the ``name``
        attribute of the candidate document types.
    """
    queryset = get_objects_for_user_owner_aware(
        user,
        ["view_documenttype"],
        DocumentType,
    )
    # Always forward the hints; the helper treats None as "no hints".
    return _match_names_to_queryset(names, queryset, "name", hinted_names)
|
||||
|
||||
|
||||
def match_storage_paths_by_name(
    names: list[str],
    user: User,
    hinted_names: set[str] | None = None,
) -> list[StoragePath]:
    """Resolve suggested storage path names to ``StoragePath`` objects.

    Args:
        names: Candidate storage path names to resolve.
        user: Passed to ``get_objects_for_user_owner_aware`` together with
            the ``view_storagepath`` permission to build the candidate
            queryset.
        hinted_names: Optional set of names forwarded unchanged to
            ``_match_names_to_queryset``. A hinted name that has no exact
            match is skipped there instead of being fuzzy-matched onto a
            different object. Defaults to ``None`` (no hints).

    Returns:
        Whatever ``_match_names_to_queryset`` returns for the ``name``
        attribute of the candidate storage paths.
    """
    queryset = get_objects_for_user_owner_aware(
        user,
        ["view_storagepath"],
        StoragePath,
    )
    # Always forward the hints; the helper treats None as "no hints".
    return _match_names_to_queryset(names, queryset, "name", hinted_names)
|
||||
|
||||
|
||||
def _normalize(s: str) -> str:
|
||||
@@ -58,10 +74,18 @@ def _normalize(s: str) -> str:
|
||||
return s
|
||||
|
||||
|
||||
def _match_names_to_queryset(names: list[str], queryset, attr: str):
|
||||
def _match_names_to_queryset(
|
||||
names: list[str],
|
||||
queryset,
|
||||
attr: str,
|
||||
hinted_names: set[str] | None = None,
|
||||
):
|
||||
results = []
|
||||
objects = list(queryset)
|
||||
object_names = [_normalize(getattr(obj, attr)) for obj in objects]
|
||||
normalized_hints = (
|
||||
{_normalize(name) for name in hinted_names} if hinted_names else set()
|
||||
)
|
||||
|
||||
for name in names:
|
||||
if not name:
|
||||
@@ -76,6 +100,11 @@ def _match_names_to_queryset(names: list[str], queryset, attr: str):
|
||||
results.append(matched)
|
||||
continue
|
||||
|
||||
# A hinted name that didn't exact-match came from existing taxonomy
|
||||
# verbatim; do not fuzzy-map it onto a different object.
|
||||
if target in normalized_hints:
|
||||
continue
|
||||
|
||||
# Fuzzy match fallback
|
||||
matches = difflib.get_close_matches(
|
||||
target,
|
||||
@@ -88,8 +117,6 @@ def _match_names_to_queryset(names: list[str], queryset, attr: str):
|
||||
matched = objects.pop(index)
|
||||
object_names.pop(index)
|
||||
results.append(matched)
|
||||
else:
|
||||
pass
|
||||
return results
|
||||
|
||||
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import difflib
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import pytest_mock
|
||||
from django.test import TestCase
|
||||
|
||||
from documents.models import Correspondent
|
||||
@@ -87,6 +89,101 @@ class TestAIMatching(TestCase):
|
||||
self.assertEqual(result[1].name, "Test Tag 2")
|
||||
|
||||
|
||||
@pytest.mark.django_db
class TestHintedMatching:
    """Exercise the ``hinted_names`` argument of ``match_tags_by_name``."""

    @staticmethod
    def _seed_tag_and_patch_lookup(
        mocker: pytest_mock.MockerFixture,
        tag_name: str,
    ) -> None:
        """Create one tag, then make the permission lookup return all tags."""
        Tag.objects.create(name=tag_name)
        mocker.patch(
            "paperless_ai.matching.get_objects_for_user_owner_aware",
            return_value=Tag.objects.all(),
        )

    def test_hinted_verbatim_skips_fuzzy(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """An exactly-matching hinted name never reaches difflib."""
        self._seed_tag_and_patch_lookup(mocker, "Bloodwork")
        fuzzy_spy = mocker.spy(difflib, "get_close_matches")

        matched = match_tags_by_name(
            ["Bloodwork"],
            user=None,
            hinted_names={"Bloodwork"},
        )

        assert [tag.name for tag in matched] == ["Bloodwork"]
        fuzzy_spy.assert_not_called()

    def test_unhinted_name_still_fuzzy_matches(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """A name absent from the hints keeps the fuzzy fallback."""
        self._seed_tag_and_patch_lookup(mocker, "Bloodwork")

        # "Bloodwrok" is a typo not in hints -> fuzzy still maps it to Bloodwork.
        matched = match_tags_by_name(
            ["Bloodwrok"],
            user=None,
            hinted_names={"Taxes"},
        )

        assert [tag.name for tag in matched] == ["Bloodwork"]

    def test_hinted_name_with_whitespace_exact_matches(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """Trailing whitespace on the input still exact-matches without difflib."""
        self._seed_tag_and_patch_lookup(mocker, "Bloodwork")
        fuzzy_spy = mocker.spy(difflib, "get_close_matches")

        matched = match_tags_by_name(
            ["Bloodwork "],
            user=None,
            hinted_names={"Bloodwork"},
        )

        assert [tag.name for tag in matched] == ["Bloodwork"]
        fuzzy_spy.assert_not_called()

    def test_hinted_name_absent_from_queryset_is_skipped_not_fuzzed(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """A hinted name with no exact object yields no match at all."""
        # A hint with no exact object must not fall through to fuzzy.
        self._seed_tag_and_patch_lookup(mocker, "Bloodwork")

        matched = match_tags_by_name(
            ["Bloodwrok"],
            user=None,
            hinted_names={"Bloodwrok"},
        )

        assert matched == []

    def test_backward_compatible_without_kwarg(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """Calling without ``hinted_names`` still works as a two-argument call."""
        self._seed_tag_and_patch_lookup(mocker, "Test Tag 1")

        matched = match_tags_by_name(["Test Tag 1", "Nonexistent"], user=None)

        assert [tag.name for tag in matched] == ["Test Tag 1"]
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestExtractUnmatchedNamesNormalization:
|
||||
def test_punctuated_name_already_matched_is_not_returned_as_unmatched(
|
||||
|
||||
Reference in New Issue
Block a user