Feature: Document fuzzy match improvements (#12579)

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-04-15 13:59:43 -07:00
committed by GitHub
parent 2b4c1fe20d
commit 2fd1a1cf3a
2 changed files with 236 additions and 90 deletions
+64 -15
View File
@@ -1,4 +1,5 @@
from io import StringIO
from unittest.mock import patch
import pytest
from django.core.management import CommandError
@@ -6,12 +7,11 @@ from django.core.management import call_command
from django.test import TestCase
from documents.models import Document
from documents.tests.factories import DocumentFactory
@pytest.mark.management
class TestFuzzyMatchCommand(TestCase):
MSG_REGEX = r"Document \d fuzzy match to \d \(confidence \d\d\.\d\d\d\)"
def call_command(self, *args, **kwargs):
stdout = StringIO()
stderr = StringIO()
@@ -77,7 +77,7 @@ class TestFuzzyMatchCommand(TestCase):
filename="other_test.pdf",
)
stdout, _ = self.call_command()
self.assertIn("No matches found", stdout)
self.assertIn("No duplicate documents found", stdout)
def test_with_matches(self) -> None:
"""
@@ -106,7 +106,7 @@ class TestFuzzyMatchCommand(TestCase):
filename="other_test.pdf",
)
stdout, _ = self.call_command("--processes", "1")
self.assertRegex(stdout, self.MSG_REGEX)
self.assertIn("Found 1 matching pair(s)", stdout)
def test_with_3_matches(self) -> None:
"""
@@ -142,10 +142,8 @@ class TestFuzzyMatchCommand(TestCase):
filename="final_test.pdf",
)
stdout, _ = self.call_command("--no-progress-bar", "--processes", "1")
lines = [x.strip() for x in stdout.splitlines() if x.strip()]
self.assertEqual(len(lines), 3)
for line in lines:
self.assertRegex(line, self.MSG_REGEX)
# 3 docs -> 3 unique pairs; summary confirms count and no duplication
self.assertIn("Found 3 matching pair(s)", stdout)
def test_document_deletion(self) -> None:
"""
@@ -186,22 +184,47 @@ class TestFuzzyMatchCommand(TestCase):
stdout, _ = self.call_command(
"--delete",
"--yes",
"--no-progress-bar",
"--processes",
"1",
)
self.assertIn(
"The command is configured to delete documents. Use with caution",
stdout,
)
self.assertRegex(stdout, self.MSG_REGEX)
self.assertIn("Deleting 1 documents based on ratio matches", stdout)
self.assertIn("Delete Mode", stdout)
self.assertIn("Found 1 matching pair(s)", stdout)
self.assertIn("Deleting 1 document(s)", stdout)
self.assertEqual(Document.objects.count(), 2)
self.assertIsNotNone(Document.objects.get(pk=1))
self.assertIsNotNone(Document.objects.get(pk=2))
def test_document_deletion_cancelled(self) -> None:
    """
    GIVEN:
        - 3 documents exist
        - Document 1 to document 3 has a similarity over 85.0
    WHEN:
        - Command is called with --delete (and without --yes), and the
          prompt is answered with "n"
    THEN:
        - The output reports that deletion was cancelled
        - All 3 documents still exist
    """
    for text in (
        "first document scanned by bob",
        "second document scanned by alice",
        "first document scanned by pete",
    ):
        DocumentFactory(content=text)
    self.assertEqual(Document.objects.count(), 3)

    command_args = ("--delete", "--no-progress-bar", "--processes", "1")
    # Stand in for the interactive user: any input() call returns "n".
    with patch("builtins.input", return_value="n"):
        output, _ = self.call_command(*command_args)

    self.assertIn("Deletion cancelled", output)
    self.assertEqual(Document.objects.count(), 3)
def test_empty_content(self) -> None:
"""
GIVEN:
@@ -226,4 +249,30 @@ class TestFuzzyMatchCommand(TestCase):
filename="other_test.pdf",
)
stdout, _ = self.call_command()
self.assertIn("No matches found", stdout)
self.assertIn("No duplicate documents found", stdout)
@pytest.mark.management
@pytest.mark.django_db
@pytest.mark.parametrize(
    ("content_a", "content_b"),
    [
        # Trailing comments give the similarity band each case is meant to hit.
        pytest.param("x" * 90 + "y" * 10, "x" * 100, id="yellow-90pct"),  # 88-92%
        pytest.param("x" * 94 + "y" * 6, "x" * 100, id="red-94pct"),  # 92-97%
        pytest.param("x" * 99 + "y", "x" * 100, id="bold-red-99pct"),  # ≥97%
    ],
)
def test_similarity_color_band(content_a: str, content_b: str) -> None:
    """
    Run the fuzzy match command over two near-identical documents.

    Each parametrized case is intended to exercise one color branch in
    _render_results; the assertion itself only checks that the summary
    reports exactly one matching pair.
    """
    for text in (content_a, content_b):
        DocumentFactory(content=text)

    captured = StringIO()
    cli_args = ("--no-progress-bar", "--processes", "1")
    call_command(
        "document_fuzzy_match",
        *cli_args,
        stdout=captured,
        skip_checks=True,
    )

    assert "Found 1 matching pair(s)" in captured.getvalue()