mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-23 06:55:23 +00:00
Feature: Document fuzzy match improvements (#12579)
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
from io import StringIO
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from django.core.management import CommandError
|
||||
@@ -6,12 +7,11 @@ from django.core.management import call_command
|
||||
from django.test import TestCase
|
||||
|
||||
from documents.models import Document
|
||||
from documents.tests.factories import DocumentFactory
|
||||
|
||||
|
||||
@pytest.mark.management
|
||||
class TestFuzzyMatchCommand(TestCase):
|
||||
MSG_REGEX = r"Document \d fuzzy match to \d \(confidence \d\d\.\d\d\d\)"
|
||||
|
||||
def call_command(self, *args, **kwargs):
|
||||
stdout = StringIO()
|
||||
stderr = StringIO()
|
||||
@@ -77,7 +77,7 @@ class TestFuzzyMatchCommand(TestCase):
|
||||
filename="other_test.pdf",
|
||||
)
|
||||
stdout, _ = self.call_command()
|
||||
self.assertIn("No matches found", stdout)
|
||||
self.assertIn("No duplicate documents found", stdout)
|
||||
|
||||
def test_with_matches(self) -> None:
|
||||
"""
|
||||
@@ -106,7 +106,7 @@ class TestFuzzyMatchCommand(TestCase):
|
||||
filename="other_test.pdf",
|
||||
)
|
||||
stdout, _ = self.call_command("--processes", "1")
|
||||
self.assertRegex(stdout, self.MSG_REGEX)
|
||||
self.assertIn("Found 1 matching pair(s)", stdout)
|
||||
|
||||
def test_with_3_matches(self) -> None:
|
||||
"""
|
||||
@@ -142,10 +142,8 @@ class TestFuzzyMatchCommand(TestCase):
|
||||
filename="final_test.pdf",
|
||||
)
|
||||
stdout, _ = self.call_command("--no-progress-bar", "--processes", "1")
|
||||
lines = [x.strip() for x in stdout.splitlines() if x.strip()]
|
||||
self.assertEqual(len(lines), 3)
|
||||
for line in lines:
|
||||
self.assertRegex(line, self.MSG_REGEX)
|
||||
# 3 docs -> 3 unique pairs; summary confirms count and no duplication
|
||||
self.assertIn("Found 3 matching pair(s)", stdout)
|
||||
|
||||
def test_document_deletion(self) -> None:
|
||||
"""
|
||||
@@ -186,22 +184,47 @@ class TestFuzzyMatchCommand(TestCase):
|
||||
|
||||
stdout, _ = self.call_command(
|
||||
"--delete",
|
||||
"--yes",
|
||||
"--no-progress-bar",
|
||||
"--processes",
|
||||
"1",
|
||||
)
|
||||
|
||||
self.assertIn(
|
||||
"The command is configured to delete documents. Use with caution",
|
||||
stdout,
|
||||
)
|
||||
self.assertRegex(stdout, self.MSG_REGEX)
|
||||
self.assertIn("Deleting 1 documents based on ratio matches", stdout)
|
||||
self.assertIn("Delete Mode", stdout)
|
||||
self.assertIn("Found 1 matching pair(s)", stdout)
|
||||
self.assertIn("Deleting 1 document(s)", stdout)
|
||||
|
||||
self.assertEqual(Document.objects.count(), 2)
|
||||
self.assertIsNotNone(Document.objects.get(pk=1))
|
||||
self.assertIsNotNone(Document.objects.get(pk=2))
|
||||
|
||||
def test_document_deletion_cancelled(self) -> None:
    """
    GIVEN:
        - 3 documents exist
        - Document 1 to document 3 has a similarity over 85.0
    WHEN:
        - Command is called with --delete and the confirmation prompt
          is answered with "n"
    THEN:
        - The cancellation is reported
        - All 3 documents are still present
    """
    contents = (
        "first document scanned by bob",
        "second document scanned by alice",
        "first document scanned by pete",
    )
    for text in contents:
        DocumentFactory(content=text)

    self.assertEqual(Document.objects.count(), 3)

    cli_args = ("--delete", "--no-progress-bar", "--processes", "1")

    # No "--yes" flag is passed; the patched input() supplies the "n" answer.
    with patch("builtins.input", return_value="n"):
        output, _ = self.call_command(*cli_args)

    self.assertIn("Deletion cancelled", output)
    self.assertEqual(Document.objects.count(), 3)
|
||||
|
||||
def test_empty_content(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
@@ -226,4 +249,30 @@ class TestFuzzyMatchCommand(TestCase):
|
||||
filename="other_test.pdf",
|
||||
)
|
||||
stdout, _ = self.call_command()
|
||||
self.assertIn("No matches found", stdout)
|
||||
self.assertIn("No duplicate documents found", stdout)
|
||||
|
||||
|
||||
@pytest.mark.management
@pytest.mark.django_db
@pytest.mark.parametrize(
    ("content_a", "content_b"),
    [
        pytest.param("x" * 90 + "y" * 10, "x" * 100, id="yellow-90pct"),  # 88-92%
        pytest.param("x" * 94 + "y" * 6, "x" * 100, id="red-94pct"),  # 92-97%
        pytest.param("x" * 99 + "y", "x" * 100, id="bold-red-99pct"),  # ≥97%
    ],
)
def test_similarity_color_band(content_a: str, content_b: str) -> None:
    """
    Run the fuzzy match command over two near-identical documents.

    Each parametrized case is meant to land in one color branch of
    _render_results; the two contents differ only in their trailing
    10, 6 or 1 characters out of 100. The check itself is that exactly
    one matching pair is reported.
    """
    for text in (content_a, content_b):
        DocumentFactory(content=text)

    captured = StringIO()
    command_args = (
        "document_fuzzy_match",
        "--no-progress-bar",
        "--processes",
        "1",
    )
    call_command(*command_args, stdout=captured, skip_checks=True)

    assert "Found 1 matching pair(s)" in captured.getvalue()
|
||||
|
||||
Reference in New Issue
Block a user