Feature: Document fuzzy match improvements (#12579)

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-04-15 13:59:43 -07:00
committed by GitHub
parent 2b4c1fe20d
commit 2fd1a1cf3a
2 changed files with 236 additions and 90 deletions

View File

@@ -1,8 +1,12 @@
import dataclasses
from itertools import combinations
from typing import Final
import rapidfuzz
from django.core.management import CommandError
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from documents.management.commands.base import PaperlessCommand
from documents.models import Document
@@ -10,8 +14,11 @@ from documents.models import Document
@dataclasses.dataclass(frozen=True, slots=True)
class _WorkPackage:
first_doc: Document
second_doc: Document
pk_a: int
content_a: str
pk_b: int
content_b: str
score_cutoff: float
@dataclasses.dataclass(frozen=True, slots=True)
@@ -20,21 +27,20 @@ class _WorkResult:
doc_two_pk: int
ratio: float
def __lt__(self, other: "_WorkResult") -> bool:
    """Order results by the primary key of their first document only."""
    return self.doc_one_pk < other.doc_one_pk
def _process_and_match(work: _WorkPackage) -> _WorkResult:
    """
    Process document content and compute the fuzzy ratio.

    Both content strings are normalised with rapidfuzz's default processor
    before comparison.  score_cutoff lets rapidfuzz short-circuit when the
    score cannot reach the threshold; per the rapidfuzz documentation a pair
    scoring below the cutoff is reported with a ratio of 0, so callers must
    still compare the returned ratio against their threshold.

    Args:
        work: The pair of documents (primary keys and content) to compare,
            plus the cutoff to pass to rapidfuzz.

    Returns:
        A _WorkResult holding both primary keys and the computed ratio.
    """
    first_string = rapidfuzz.utils.default_process(work.content_a)
    second_string = rapidfuzz.utils.default_process(work.content_b)
    ratio = rapidfuzz.fuzz.ratio(
        first_string,
        second_string,
        score_cutoff=work.score_cutoff,
    )
    return _WorkResult(work.pk_a, work.pk_b, ratio)
class Command(PaperlessCommand):
@@ -57,78 +63,169 @@ class Command(PaperlessCommand):
action="store_true",
help="If set, one document of matches above the ratio WILL BE DELETED",
)
parser.add_argument(
"--yes",
default=False,
action="store_true",
help="Skip the confirmation prompt when used with --delete",
)
def _render_results(
    self,
    matches: list[_WorkResult],
    *,
    opt_ratio: float,
    do_delete: bool,
) -> list[int]:
    """
    Render match results as a Rich table.

    Args:
        matches: The matched pairs to display, in display order.
        opt_ratio: The threshold, shown in the table title.
        do_delete: When set, the summary also states how many documents
            will be deleted.

    Returns:
        The unique primary keys of the second document of every pair, in
        first-seen order.  These are the deletion candidates; an empty list
        is returned when there are no matches.
    """
    if not matches:
        self.console.print(
            Panel(
                "[green]No duplicate documents found.[/green]",
                title="Fuzzy Match",
                border_style="green",
            ),
        )
        return []

    # Fetch titles for matched documents in a single query.
    all_pks = {pk for m in matches for pk in (m.doc_one_pk, m.doc_two_pk)}
    titles: dict[int, str] = dict(
        Document.objects.filter(pk__in=all_pks).values_list("pk", "title"),
    )

    def _doc_label(pk: int) -> Text:
        # Titles are user data: assemble them as plain text so that square
        # brackets in a title are never interpreted as Rich markup.
        return Text.assemble((f"#{pk}", "dim"), " ", titles.get(pk, "Unknown"))

    table = Table(
        title=f"Fuzzy Matches (threshold: {opt_ratio:.1f}%)",
        show_lines=True,
        title_style="bold",
    )
    table.add_column("#", style="dim", width=4, no_wrap=True)
    table.add_column("Document A", min_width=24)
    table.add_column("Document B", min_width=24)
    table.add_column("Similarity", width=11, justify="right")

    for i, match_result in enumerate(matches, 1):
        ratio = match_result.ratio
        # Stronger colours for closer matches.
        if ratio >= 97.0:
            ratio_style = "bold red"
        elif ratio >= 92.0:
            ratio_style = "red"
        elif ratio >= 88.0:
            ratio_style = "yellow"
        else:
            ratio_style = "dim"

        table.add_row(
            str(i),
            _doc_label(match_result.doc_one_pk),
            _doc_label(match_result.doc_two_pk),
            Text(f"{ratio:.1f}%", style=ratio_style),
        )

    # A document can be the second member of several pairs; de-duplicate
    # (keeping first-seen order) so the reported count equals the number of
    # documents that would actually be removed.
    maybe_delete_ids: list[int] = list(
        dict.fromkeys(m.doc_two_pk for m in matches),
    )

    self.console.print(table)

    summary = f"Found [bold]{len(matches)}[/bold] matching pair(s)."
    if do_delete:
        summary += f" [yellow]{len(maybe_delete_ids)}[/yellow] document(s) will be deleted."
    self.console.print(summary)

    return maybe_delete_ids
def handle(self, *args, **options):
    """
    Fuzzy match every pair of documents with non-blank content.

    Pairs whose ratio is at or above the "ratio" option are rendered as a
    table.  With the "delete" option set, the second document of each
    matched pair is deleted once the user confirms; the "yes" option skips
    the confirmation prompt.

    Raises:
        CommandError: If the ratio option is outside the range 0-100.
    """
    RATIO_MIN: Final[float] = 0.0
    RATIO_MAX: Final[float] = 100.0

    opt_ratio = options["ratio"]

    if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
        raise CommandError("The ratio must be between 0 and 100")

    if options["delete"]:
        self.console.print(
            Panel(
                "[bold yellow]WARNING:[/bold yellow] This run is configured to delete"
                " documents. One document from each matched pair WILL BE PERMANENTLY DELETED.",
                title="Delete Mode",
                border_style="red",
            ),
        )

    # Load only the fields we need -- avoids fetching title, archive_checksum, etc.
    # Documents with blank content are dropped here, once, so that every
    # generated pair is real work and the progress total below is exact.
    slim_docs: list[tuple[int, str]] = [
        (pk, content)
        for pk, content in Document.objects.order_by("id").values_list(
            "id",
            "content",
        )
        if content and content.strip()
    ]

    # combinations() generates each unique pair exactly once -- no checked_pairs set needed.
    # The total is computed cheaply so the progress bar can start immediately without
    # materialising all pairs up front (n*(n-1)/2 can be hundreds of thousands).
    n = len(slim_docs)
    total_pairs = n * (n - 1) // 2

    def _work_gen():
        for (pk_a, ca), (pk_b, cb) in combinations(slim_docs, 2):
            yield _WorkPackage(pk_a, ca, pk_b, cb, opt_ratio)

    def _iter_matches():
        if self.process_count == 1:
            for work in self.track(
                _work_gen(),
                description="Matching...",
                total=total_pairs,
            ):
                result = _process_and_match(work)
                if result.ratio >= opt_ratio:
                    yield result
        else:  # pragma: no cover
            work_pkgs = list(_work_gen())
            for proc_result in self.process_parallel(
                _process_and_match,
                work_pkgs,
                description="Matching...",
            ):
                if proc_result.error:
                    self.console.print(
                        f"[red]Failed: {proc_result.error}[/red]",
                    )
                elif (
                    proc_result.result is not None
                    and proc_result.result.ratio >= opt_ratio
                ):
                    yield proc_result.result

    # Most similar pairs first.
    matches = sorted(_iter_matches(), key=lambda m: m.ratio, reverse=True)
    maybe_delete_ids = self._render_results(
        matches,
        opt_ratio=opt_ratio,
        do_delete=options["delete"],
    )

    if options["delete"] and maybe_delete_ids:
        confirmed = options["yes"]
        if not confirmed:
            self.console.print(
                f"\nDelete [bold]{len(maybe_delete_ids)}[/bold] document(s)? "
                "[bold]\\[y/N][/bold] ",
                end="",
            )
            try:
                answer = input().strip().lower()
            except EOFError:
                # No stdin to answer the prompt (e.g. a non-interactive run):
                # fall back to the safe default and do not delete.
                answer = ""
            confirmed = answer in {"y", "yes"}

        if confirmed:
            self.console.print(
                f"[red]Deleting {len(maybe_delete_ids)} document(s)...[/red]",
            )
            Document.objects.filter(pk__in=maybe_delete_ids).delete()
            self.console.print("[green]Done.[/green]")
        else:
            self.console.print("[yellow]Deletion cancelled.[/yellow]")

View File

@@ -1,4 +1,5 @@
from io import StringIO
from unittest.mock import patch
import pytest
from django.core.management import CommandError
@@ -6,12 +7,11 @@ from django.core.management import call_command
from django.test import TestCase
from documents.models import Document
from documents.tests.factories import DocumentFactory
@pytest.mark.management
class TestFuzzyMatchCommand(TestCase):
MSG_REGEX = r"Document \d fuzzy match to \d \(confidence \d\d\.\d\d\d\)"
def call_command(self, *args, **kwargs):
stdout = StringIO()
stderr = StringIO()
@@ -77,7 +77,7 @@ class TestFuzzyMatchCommand(TestCase):
filename="other_test.pdf",
)
stdout, _ = self.call_command()
self.assertIn("No matches found", stdout)
self.assertIn("No duplicate documents found", stdout)
def test_with_matches(self) -> None:
"""
@@ -106,7 +106,7 @@ class TestFuzzyMatchCommand(TestCase):
filename="other_test.pdf",
)
stdout, _ = self.call_command("--processes", "1")
self.assertRegex(stdout, self.MSG_REGEX)
self.assertIn("Found 1 matching pair(s)", stdout)
def test_with_3_matches(self) -> None:
"""
@@ -142,10 +142,8 @@ class TestFuzzyMatchCommand(TestCase):
filename="final_test.pdf",
)
stdout, _ = self.call_command("--no-progress-bar", "--processes", "1")
lines = [x.strip() for x in stdout.splitlines() if x.strip()]
self.assertEqual(len(lines), 3)
for line in lines:
self.assertRegex(line, self.MSG_REGEX)
# 3 docs -> 3 unique pairs; summary confirms count and no duplication
self.assertIn("Found 3 matching pair(s)", stdout)
def test_document_deletion(self) -> None:
"""
@@ -186,22 +184,47 @@ class TestFuzzyMatchCommand(TestCase):
stdout, _ = self.call_command(
"--delete",
"--yes",
"--no-progress-bar",
"--processes",
"1",
)
self.assertIn(
"The command is configured to delete documents. Use with caution",
stdout,
)
self.assertRegex(stdout, self.MSG_REGEX)
self.assertIn("Deleting 1 documents based on ratio matches", stdout)
self.assertIn("Delete Mode", stdout)
self.assertIn("Found 1 matching pair(s)", stdout)
self.assertIn("Deleting 1 document(s)", stdout)
self.assertEqual(Document.objects.count(), 2)
self.assertIsNotNone(Document.objects.get(pk=1))
self.assertIsNotNone(Document.objects.get(pk=2))
def test_document_deletion_cancelled(self) -> None:
    """
    GIVEN:
        - 3 documents exist
        - Document 1 to document 3 has a similarity over 85.0
    WHEN:
        - Command is called with --delete and the confirmation prompt
          is answered with "n"
    THEN:
        - The cancellation message is printed
        - All 3 documents still exist
    """
    for content in (
        "first document scanned by bob",
        "second document scanned by alice",
        "first document scanned by pete",
    ):
        DocumentFactory(content=content)

    self.assertEqual(Document.objects.count(), 3)

    cli_args = ("--delete", "--no-progress-bar", "--processes", "1")
    # Simulate the user declining at the interactive prompt.
    with patch("builtins.input", return_value="n"):
        stdout, _ = self.call_command(*cli_args)

    self.assertIn("Deletion cancelled", stdout)
    self.assertEqual(Document.objects.count(), 3)
def test_empty_content(self) -> None:
"""
GIVEN:
@@ -226,4 +249,30 @@ class TestFuzzyMatchCommand(TestCase):
filename="other_test.pdf",
)
stdout, _ = self.call_command()
self.assertIn("No matches found", stdout)
self.assertIn("No duplicate documents found", stdout)
@pytest.mark.management
@pytest.mark.django_db
@pytest.mark.parametrize(
    ("content_a", "content_b"),
    [
        pytest.param("x" * 90 + "y" * 10, "x" * 100, id="yellow-90pct"),  # 88-92%
        pytest.param("x" * 94 + "y" * 6, "x" * 100, id="red-94pct"),  # 92-97%
        pytest.param("x" * 99 + "y", "x" * 100, id="bold-red-99pct"),  # ≥97%
    ],
)
def test_similarity_color_band(content_a: str, content_b: str) -> None:
    """
    Run the command on one document pair per similarity colour band.

    Only the summary line is asserted: each case must report exactly one
    matching pair.  The colour applied to the ratio is not checked.
    """
    for content in (content_a, content_b):
        DocumentFactory(content=content)

    captured = StringIO()
    cli_args = ("--no-progress-bar", "--processes", "1")
    call_command(
        "document_fuzzy_match",
        *cli_args,
        stdout=captured,
        skip_checks=True,
    )

    assert "Found 1 matching pair(s)" in captured.getvalue()