Compare commits

..

3 Commits

Author SHA1 Message Date
Trenton H
33a27167d5 refactor: clarify isolation_level intent and cache_size unit
Add comment explaining isolation_level is explicit even though Django
defaults to the same value, so the intent survives future Django changes.
Add KiB unit comment to cache_size=-8000.
Rename sqlite-init-command-override test to sqlite-options-override to
reflect that both init_command and transaction_mode are being overridden.
2026-04-14 16:22:45 -07:00
Trenton H
a9b6b403ac docs: update PAPERLESS_DB_OPTIONS for comma separator and new engine defaults
Change all examples from semicolon to comma-delimited format. Add notes
for SQLite WAL defaults (with override example) and MariaDB READ COMMITTED
binlog_format=ROW prerequisite.
2026-04-14 15:36:30 -07:00
Trenton H
b595da9221 feat: add database tuning defaults to parse_db_settings
Change PAPERLESS_DB_OPTIONS separator from ';' to ',' so SQLite
init_command values (which use ';' between PRAGMAs) can be overridden
without escaping.

SQLite: WAL journal mode, NORMAL synchronous, 5s busy timeout, memory
temp store, 128MB mmap, 64MB journal size limit, 8MB cache, and
IMMEDIATE transaction mode for correct busy_timeout behaviour.
PostgreSQL: application_name=paperless-ngx for pg_stat_activity.
MariaDB: isolation_level=read committed to eliminate gap locking.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 15:31:10 -07:00
6 changed files with 190 additions and 221 deletions

View File

@@ -101,7 +101,7 @@ and `mariadb`.
#### [`PAPERLESS_DB_OPTIONS=<options>`](#PAPERLESS_DB_OPTIONS) {#PAPERLESS_DB_OPTIONS}
: Advanced database connection options as a semicolon-delimited key-value string.
: Advanced database connection options as a comma-delimited key-value string.
Keys and values are separated by `=`. Dot-notation produces nested option
dictionaries; for example, `pool.max_size=20` sets
`OPTIONS["pool"]["max_size"] = 20`.
@@ -123,18 +123,36 @@ dictionaries; for example, `pool.max_size=20` sets
to handle all pool connections across all workers:
`(web_workers + celery_workers) * pool.max_size + safety_margin`.
!!! note "SQLite defaults"
SQLite connections are pre-configured with WAL journal mode, optimised
synchronous and cache settings, and a 5-second busy timeout. These defaults
suit most deployments. To override `init_command`, use `;` between PRAGMAs
within the value and `,` between options:
```bash
PAPERLESS_DB_OPTIONS="init_command=PRAGMA journal_mode=DELETE;PRAGMA synchronous=FULL,transaction_mode=DEFERRED"
```
!!! note "MariaDB: READ COMMITTED isolation level"
MariaDB connections default to `READ COMMITTED` isolation level, which
eliminates gap locking and reduces deadlock frequency. If binary logging is
enabled on your MariaDB server, this requires row-based logging:
`binlog_format=ROW` (the default for most managed MariaDB instances) or
`binlog_format=MIXED`, which switches to row-based logging automatically.
Statement-based replication (`binlog_format=STATEMENT`) is not compatible
with `READ COMMITTED`.
**Examples:**
```bash title="PostgreSQL: require SSL, set a custom CA certificate, and limit the pool size"
PAPERLESS_DB_OPTIONS="sslmode=require;sslrootcert=/certs/ca.pem;pool.max_size=5"
PAPERLESS_DB_OPTIONS="sslmode=require,sslrootcert=/certs/ca.pem,pool.max_size=5"
```
```bash title="MariaDB: require SSL with a custom CA certificate"
PAPERLESS_DB_OPTIONS="ssl_mode=REQUIRED;ssl.ca=/certs/ca.pem"
PAPERLESS_DB_OPTIONS="ssl_mode=REQUIRED,ssl.ca=/certs/ca.pem"
```
```bash title="SQLite: set a busy timeout of 30 seconds"
# PostgreSQL: set a connection timeout
```bash title="PostgreSQL or MariaDB: set a connection timeout"
PAPERLESS_DB_OPTIONS="connect_timeout=10"
```

View File

@@ -120,7 +120,7 @@ Users with any of the deprecated variables set should migrate to `PAPERLESS_DB_O
Multiple options are combined in a single value:
```bash
PAPERLESS_DB_OPTIONS="sslmode=require;sslrootcert=/certs/ca.pem;pool.max_size=10"
PAPERLESS_DB_OPTIONS="sslmode=require,sslrootcert=/certs/ca.pem,pool.max_size=10"
```
## OCR and Archive File Generation Settings

View File

@@ -1,12 +1,8 @@
import dataclasses
from itertools import combinations
from typing import Final
import rapidfuzz
from django.core.management import CommandError
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from documents.management.commands.base import PaperlessCommand
from documents.models import Document
@@ -14,11 +10,8 @@ from documents.models import Document
@dataclasses.dataclass(frozen=True, slots=True)
class _WorkPackage:
pk_a: int
content_a: str
pk_b: int
content_b: str
score_cutoff: float
first_doc: Document
second_doc: Document
@dataclasses.dataclass(frozen=True, slots=True)
@@ -33,17 +26,15 @@ class _WorkResult:
def _process_and_match(work: _WorkPackage) -> _WorkResult:
"""
Process document content and compute the fuzzy ratio.
score_cutoff lets rapidfuzz short-circuit when the score cannot reach the threshold.
Does basic processing of document content, gets the basic ratio
and returns the result package.
"""
first_string = rapidfuzz.utils.default_process(work.content_a)
second_string = rapidfuzz.utils.default_process(work.content_b)
ratio = rapidfuzz.fuzz.ratio(
first_string,
second_string,
score_cutoff=work.score_cutoff,
)
return _WorkResult(work.pk_a, work.pk_b, ratio)
first_string = rapidfuzz.utils.default_process(work.first_doc.content)
second_string = rapidfuzz.utils.default_process(work.second_doc.content)
match = rapidfuzz.fuzz.ratio(first_string, second_string)
return _WorkResult(work.first_doc.pk, work.second_doc.pk, match)
class Command(PaperlessCommand):
@@ -66,160 +57,78 @@ class Command(PaperlessCommand):
action="store_true",
help="If set, one document of matches above the ratio WILL BE DELETED",
)
parser.add_argument(
"--yes",
default=False,
action="store_true",
help="Skip the confirmation prompt when used with --delete",
)
def _render_results(
    self,
    matches: list[_WorkResult],
    *,
    opt_ratio: float,
    do_delete: bool,
) -> list[int]:
    """
    Render match results as a Rich table and collect deletion candidates.

    Args:
        matches: Pairs to display, one table row per pair.
        opt_ratio: Threshold percentage shown in the table title.
        do_delete: When true, the summary line also reports how many
            documents will be deleted.

    Returns:
        The second document's PK of every pair, in first-seen order and
        without duplicates. Empty when there are no matches.
    """
    # Imported locally so this method carries its own dependency; the
    # module already depends on rich for Panel/Table/Text.
    from rich.markup import escape

    if not matches:
        self.console.print(
            Panel(
                "[green]No duplicate documents found.[/green]",
                title="Fuzzy Match",
                border_style="green",
            ),
        )
        return []

    # Fetch titles for matched documents in a single query.
    all_pks = {pk for m in matches for pk in (m.doc_one_pk, m.doc_two_pk)}
    titles: dict[int, str] = dict(
        Document.objects.filter(pk__in=all_pks)
        .only("pk", "title")
        .values_list("pk", "title"),
    )

    def _label(pk: int) -> str:
        # Titles are user data: escape them so a title containing "[...]"
        # is shown literally instead of being parsed as Rich markup.
        return f"[dim]#{pk}[/dim] {escape(str(titles.get(pk, 'Unknown')))}"

    table = Table(
        title=f"Fuzzy Matches (threshold: {opt_ratio:.1f}%)",
        show_lines=True,
        title_style="bold",
    )
    table.add_column("#", style="dim", width=4, no_wrap=True)
    table.add_column("Document A", min_width=24)
    table.add_column("Document B", min_width=24)
    table.add_column("Similarity", width=11, justify="right")

    # A dict is used as an insertion-ordered set: the same document can be
    # the second member of several pairs but must only be counted (and
    # returned) once.
    delete_candidates: dict[int, None] = {}
    for i, match_result in enumerate(matches, 1):
        pk_a = match_result.doc_one_pk
        pk_b = match_result.doc_two_pk
        ratio = match_result.ratio

        # Stronger colours for higher similarity.
        if ratio >= 97.0:
            ratio_style = "bold red"
        elif ratio >= 92.0:
            ratio_style = "red"
        elif ratio >= 88.0:
            ratio_style = "yellow"
        else:
            ratio_style = "dim"

        table.add_row(
            str(i),
            _label(pk_a),
            _label(pk_b),
            Text(f"{ratio:.1f}%", style=ratio_style),
        )
        delete_candidates.setdefault(pk_b)

    maybe_delete_ids = list(delete_candidates)

    self.console.print(table)
    summary = f"Found [bold]{len(matches)}[/bold] matching pair(s)."
    if do_delete:
        summary += f" [yellow]{len(maybe_delete_ids)}[/yellow] document(s) will be deleted."
    self.console.print(summary)
    return maybe_delete_ids
def handle(self, *args, **options):
RATIO_MIN: Final[float] = 0.0
RATIO_MAX: Final[float] = 100.0
if options["delete"]:
self.stdout.write(
self.style.WARNING(
"The command is configured to delete documents. Use with caution",
),
)
opt_ratio = options["ratio"]
checked_pairs: set[tuple[int, int]] = set()
work_pkgs: list[_WorkPackage] = []
if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
raise CommandError("The ratio must be between 0 and 100")
all_docs = Document.objects.all().order_by("id")
for first_doc in all_docs:
for second_doc in all_docs:
if first_doc.pk == second_doc.pk:
continue
if first_doc.content.strip() == "" or second_doc.content.strip() == "":
continue
doc_1_to_doc_2 = (first_doc.pk, second_doc.pk)
doc_2_to_doc_1 = doc_1_to_doc_2[::-1]
if doc_1_to_doc_2 in checked_pairs or doc_2_to_doc_1 in checked_pairs:
continue
checked_pairs.update([doc_1_to_doc_2, doc_2_to_doc_1])
work_pkgs.append(_WorkPackage(first_doc, second_doc))
results: list[_WorkResult] = []
if self.process_count == 1:
for work in self.track(work_pkgs, description="Matching..."):
results.append(_process_and_match(work))
else: # pragma: no cover
for proc_result in self.process_parallel(
_process_and_match,
work_pkgs,
description="Matching...",
):
if proc_result.error:
self.console.print(
f"[red]Failed: {proc_result.error}[/red]",
)
elif proc_result.result is not None:
results.append(proc_result.result)
messages: list[str] = []
maybe_delete_ids: list[int] = []
for match_result in sorted(results):
if match_result.ratio >= opt_ratio:
messages.append(
self.style.NOTICE(
f"Document {match_result.doc_one_pk} fuzzy match"
f" to {match_result.doc_two_pk}"
f" (confidence {match_result.ratio:.3f})\n",
),
)
maybe_delete_ids.append(match_result.doc_two_pk)
if len(messages) == 0:
messages.append(self.style.SUCCESS("No matches found\n"))
self.stdout.writelines(messages)
if options["delete"]:
self.console.print(
Panel(
"[bold yellow]WARNING:[/bold yellow] This run is configured to delete"
" documents. One document from each matched pair WILL BE PERMANENTLY DELETED.",
title="Delete Mode",
border_style="red",
self.stdout.write(
self.style.NOTICE(
f"Deleting {len(maybe_delete_ids)} documents based on ratio matches",
),
)
# Load only the fields we need -- avoids fetching title, archive_checksum, etc.
slim_docs: list[tuple[int, str]] = list(
Document.objects.only("id", "content")
.order_by("id")
.values_list("id", "content"),
)
# combinations() generates each unique pair exactly once -- no checked_pairs set needed.
work_pkgs: list[_WorkPackage] = [
_WorkPackage(pk_a, ca, pk_b, cb, opt_ratio)
for (pk_a, ca), (pk_b, cb) in combinations(slim_docs, 2)
if ca.strip() and cb.strip()
]
def _iter_matches():
if self.process_count == 1:
for work in self.track(work_pkgs, description="Matching..."):
result = _process_and_match(work)
if result.ratio >= opt_ratio:
yield result
else: # pragma: no cover
for proc_result in self.process_parallel(
_process_and_match,
work_pkgs,
description="Matching...",
):
if proc_result.error:
self.console.print(
f"[red]Failed: {proc_result.error}[/red]",
)
elif (
proc_result.result is not None
and proc_result.result.ratio >= opt_ratio
):
yield proc_result.result
matches = sorted(_iter_matches())
maybe_delete_ids = self._render_results(
matches,
opt_ratio=opt_ratio,
do_delete=options["delete"],
)
if options["delete"] and maybe_delete_ids:
confirmed = options["yes"]
if not confirmed:
self.console.print(
f"\nDelete [bold]{len(maybe_delete_ids)}[/bold] document(s)? "
"[bold]\\[y/N][/bold] ",
end="",
)
answer = input().strip().lower()
confirmed = answer in {"y", "yes"}
if confirmed:
self.console.print(
f"[red]Deleting {len(maybe_delete_ids)} document(s)...[/red]",
)
Document.objects.filter(pk__in=maybe_delete_ids).delete()
self.console.print("[green]Done.[/green]")
else:
self.console.print("[yellow]Deletion cancelled.[/yellow]")
Document.objects.filter(pk__in=maybe_delete_ids).delete()

View File

@@ -1,5 +1,4 @@
from io import StringIO
from unittest.mock import patch
import pytest
from django.core.management import CommandError
@@ -7,11 +6,12 @@ from django.core.management import call_command
from django.test import TestCase
from documents.models import Document
from documents.tests.factories import DocumentFactory
@pytest.mark.management
class TestFuzzyMatchCommand(TestCase):
MSG_REGEX = r"Document \d fuzzy match to \d \(confidence \d\d\.\d\d\d\)"
def call_command(self, *args, **kwargs):
stdout = StringIO()
stderr = StringIO()
@@ -77,7 +77,7 @@ class TestFuzzyMatchCommand(TestCase):
filename="other_test.pdf",
)
stdout, _ = self.call_command()
self.assertIn("No duplicate documents found", stdout)
self.assertIn("No matches found", stdout)
def test_with_matches(self) -> None:
"""
@@ -106,7 +106,7 @@ class TestFuzzyMatchCommand(TestCase):
filename="other_test.pdf",
)
stdout, _ = self.call_command("--processes", "1")
self.assertIn("Found 1 matching pair(s)", stdout)
self.assertRegex(stdout, self.MSG_REGEX)
def test_with_3_matches(self) -> None:
"""
@@ -142,8 +142,10 @@ class TestFuzzyMatchCommand(TestCase):
filename="final_test.pdf",
)
stdout, _ = self.call_command("--no-progress-bar", "--processes", "1")
# 3 docs -> 3 unique pairs; summary confirms count and no duplication
self.assertIn("Found 3 matching pair(s)", stdout)
lines = [x.strip() for x in stdout.splitlines() if x.strip()]
self.assertEqual(len(lines), 3)
for line in lines:
self.assertRegex(line, self.MSG_REGEX)
def test_document_deletion(self) -> None:
"""
@@ -184,47 +186,22 @@ class TestFuzzyMatchCommand(TestCase):
stdout, _ = self.call_command(
"--delete",
"--yes",
"--no-progress-bar",
"--processes",
"1",
)
self.assertIn("Delete Mode", stdout)
self.assertIn("Found 1 matching pair(s)", stdout)
self.assertIn("Deleting 1 document(s)", stdout)
self.assertIn(
"The command is configured to delete documents. Use with caution",
stdout,
)
self.assertRegex(stdout, self.MSG_REGEX)
self.assertIn("Deleting 1 documents based on ratio matches", stdout)
self.assertEqual(Document.objects.count(), 2)
self.assertIsNotNone(Document.objects.get(pk=1))
self.assertIsNotNone(Document.objects.get(pk=2))
def test_document_deletion_cancelled(self) -> None:
"""
GIVEN:
- 3 documents exist
- Document 1 to document 3 has a similarity over 85.0
WHEN:
- Command is called with --delete but user answers "n" at the prompt
THEN:
- No documents are deleted
"""
# Documents 1 and 3 differ only in the final word; document 2 is the non-matching control.
DocumentFactory(content="first document scanned by bob")
DocumentFactory(content="second document scanned by alice")
DocumentFactory(content="first document scanned by pete")
self.assertEqual(Document.objects.count(), 3)
# "--yes" is deliberately not passed; builtins.input is patched to answer "n".
with patch("builtins.input", return_value="n"):
stdout, _ = self.call_command(
"--delete",
"--no-progress-bar",
"--processes",
"1",
)
# The cancellation message is printed and all three documents are still present.
self.assertIn("Deletion cancelled", stdout)
self.assertEqual(Document.objects.count(), 3)
def test_empty_content(self) -> None:
"""
GIVEN:
@@ -249,4 +226,4 @@ class TestFuzzyMatchCommand(TestCase):
filename="other_test.pdf",
)
stdout, _ = self.call_command()
self.assertIn("No duplicate documents found", stdout)
self.assertIn("No matches found", stdout)

View File

@@ -224,7 +224,23 @@ def parse_db_settings(data_dir: Path) -> dict[str, dict[str, Any]]:
"ENGINE": "django.db.backends.sqlite3",
"NAME": str((data_dir / "db.sqlite3").resolve()),
}
base_options = {}
base_options = {
# Django splits init_command on ";" and calls conn.execute()
# once per statement, so multiple PRAGMAs work correctly.
# foreign_keys is omitted — Django sets it natively.
"init_command": (
"PRAGMA journal_mode=WAL;"
"PRAGMA synchronous=NORMAL;"
"PRAGMA busy_timeout=5000;"
"PRAGMA temp_store=MEMORY;"
"PRAGMA mmap_size=134217728;"
"PRAGMA journal_size_limit=67108864;"
"PRAGMA cache_size=-8000" # negative = KiB; -8000 ≈ 8 MB
),
# IMMEDIATE acquires the write lock at BEGIN, ensuring
# busy_timeout is respected from the start of the transaction.
"transaction_mode": "IMMEDIATE",
}
case "postgresql":
db_config = {
@@ -240,6 +256,7 @@ def parse_db_settings(data_dir: Path) -> dict[str, dict[str, Any]]:
"sslrootcert": os.getenv("PAPERLESS_DBSSLROOTCERT"),
"sslcert": os.getenv("PAPERLESS_DBSSLCERT"),
"sslkey": os.getenv("PAPERLESS_DBSSLKEY"),
"application_name": "paperless-ngx",
}
if (pool_size := get_int_from_env("PAPERLESS_DB_POOLSIZE")) is not None:
@@ -267,6 +284,12 @@ def parse_db_settings(data_dir: Path) -> dict[str, dict[str, Any]]:
"cert": os.getenv("PAPERLESS_DBSSLCERT"),
"key": os.getenv("PAPERLESS_DBSSLKEY"),
},
# READ COMMITTED eliminates gap locking and reduces deadlocks.
# Django also defaults to "read committed" for MySQL/MariaDB, but
# we set it explicitly so the intent is clear and survives any
# future changes to Django's default.
# Requires binlog_format=ROW if binary logging is enabled.
"isolation_level": "read committed",
}
case _: # pragma: no cover
raise NotImplementedError(engine)
@@ -287,7 +310,7 @@ def parse_db_settings(data_dir: Path) -> dict[str, dict[str, Any]]:
db_config["OPTIONS"] = parse_dict_from_str(
os.getenv("PAPERLESS_DB_OPTIONS"),
defaults=base_options,
separator=";",
separator=",",
type_map={
# SQLite options
"timeout": int,

View File

@@ -296,8 +296,19 @@ class TestParseDbSettings:
{
"default": {
"ENGINE": "django.db.backends.sqlite3",
"NAME": None, # Will be replaced with tmp_path
"OPTIONS": {},
"NAME": None, # replaced with tmp_path in test body
"OPTIONS": {
"init_command": (
"PRAGMA journal_mode=WAL;"
"PRAGMA synchronous=NORMAL;"
"PRAGMA busy_timeout=5000;"
"PRAGMA temp_store=MEMORY;"
"PRAGMA mmap_size=134217728;"
"PRAGMA journal_size_limit=67108864;"
"PRAGMA cache_size=-8000"
),
"transaction_mode": "IMMEDIATE",
},
},
},
id="default-sqlite",
@@ -310,14 +321,41 @@ class TestParseDbSettings:
{
"default": {
"ENGINE": "django.db.backends.sqlite3",
"NAME": None, # Will be replaced with tmp_path
"NAME": None,
"OPTIONS": {
"init_command": (
"PRAGMA journal_mode=WAL;"
"PRAGMA synchronous=NORMAL;"
"PRAGMA busy_timeout=5000;"
"PRAGMA temp_store=MEMORY;"
"PRAGMA mmap_size=134217728;"
"PRAGMA journal_size_limit=67108864;"
"PRAGMA cache_size=-8000"
),
"transaction_mode": "IMMEDIATE",
"timeout": 30,
},
},
},
id="sqlite-with-timeout-override",
),
pytest.param(
{
"PAPERLESS_DBENGINE": "sqlite",
"PAPERLESS_DB_OPTIONS": "init_command=PRAGMA journal_mode=DELETE;PRAGMA synchronous=FULL,transaction_mode=DEFERRED",
},
{
"default": {
"ENGINE": "django.db.backends.sqlite3",
"NAME": None,
"OPTIONS": {
"init_command": "PRAGMA journal_mode=DELETE;PRAGMA synchronous=FULL",
"transaction_mode": "DEFERRED",
},
},
},
id="sqlite-options-override",
),
pytest.param(
{
"PAPERLESS_DBENGINE": "postgresql",
@@ -335,6 +373,7 @@ class TestParseDbSettings:
"sslrootcert": None,
"sslcert": None,
"sslkey": None,
"application_name": "paperless-ngx",
},
},
},
@@ -348,7 +387,7 @@ class TestParseDbSettings:
"PAPERLESS_DBNAME": "customdb",
"PAPERLESS_DBUSER": "customuser",
"PAPERLESS_DBPASS": "custompass",
"PAPERLESS_DB_OPTIONS": "pool.max_size=50;pool.min_size=2;sslmode=require",
"PAPERLESS_DB_OPTIONS": "pool.max_size=50,pool.min_size=2,sslmode=require",
},
{
"default": {
@@ -363,6 +402,7 @@ class TestParseDbSettings:
"sslrootcert": None,
"sslcert": None,
"sslkey": None,
"application_name": "paperless-ngx",
"pool": {
"min_size": 2,
"max_size": 50,
@@ -390,6 +430,7 @@ class TestParseDbSettings:
"sslrootcert": None,
"sslcert": None,
"sslkey": None,
"application_name": "paperless-ngx",
"pool": {
"min_size": 1,
"max_size": 10,
@@ -419,6 +460,7 @@ class TestParseDbSettings:
"sslrootcert": "/certs/ca.crt",
"sslcert": None,
"sslkey": None,
"application_name": "paperless-ngx",
"connect_timeout": 30,
},
},
@@ -447,6 +489,7 @@ class TestParseDbSettings:
"cert": None,
"key": None,
},
"isolation_level": "read committed",
},
},
},
@@ -455,18 +498,17 @@ class TestParseDbSettings:
pytest.param(
{
"PAPERLESS_DBENGINE": "mariadb",
"PAPERLESS_DBHOST": "paperless-mariadb-host",
"PAPERLESS_DBPORT": "5555",
"PAPERLESS_DBHOST": "mariahost",
"PAPERLESS_DBNAME": "paperlessdb",
"PAPERLESS_DBUSER": "my-cool-user",
"PAPERLESS_DBPASS": "my-secure-password",
"PAPERLESS_DB_OPTIONS": "ssl.ca=/path/to/ca.pem;ssl_mode=REQUIRED",
"PAPERLESS_DB_OPTIONS": "ssl_mode=REQUIRED,ssl.ca=/path/to/ca.pem",
},
{
"default": {
"ENGINE": "django.db.backends.mysql",
"HOST": "paperless-mariadb-host",
"PORT": 5555,
"NAME": "paperless",
"HOST": "mariahost",
"NAME": "paperlessdb",
"USER": "my-cool-user",
"PASSWORD": "my-secure-password",
"OPTIONS": {
@@ -479,6 +521,7 @@ class TestParseDbSettings:
"cert": None,
"key": None,
},
"isolation_level": "read committed",
},
},
},
@@ -512,6 +555,7 @@ class TestParseDbSettings:
"key": "/certs/client.key",
},
"connect_timeout": 25,
"isolation_level": "read committed",
},
},
},
@@ -527,10 +571,8 @@ class TestParseDbSettings:
expected_database_settings: dict[str, dict],
) -> None:
"""Test various database configurations with defaults and overrides."""
# Clear environment and set test vars
mocker.patch.dict(os.environ, env_vars, clear=True)
# Update expected paths with actual tmp_path
if (
"default" in expected_database_settings
and expected_database_settings["default"]["NAME"] is None