mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-18 03:14:19 +00:00
a020f64d08
* Chore(beta): add sqlite-vec 0.1.9 dependency Pinned exactly: the 0.1.9 wheels carry no baked SIMD flags (safe on pre-AVX2 CPUs, the point of this migration); the 0.1.10 alphas bake -mavx and would reintroduce the #12970 crash class. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Test(beta): port vector store tests to sqlite-vec backend Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Enhancement(beta): switch AI vector store from LanceDB to sqlite-vec Fixes the non-AVX2 SIGILL class (#12970) at the root: lancedb is no longer imported. sqlite-vec 0.1.9 wheels carry no baked SIMD, vec0 metadata columns give parameterized EQ/IN filtering, WAL preserves the lock-free-reader model, and compact() rebuilds the table because vec0 DELETEs never reclaim space. Implementation notes vs. the Task 3A draft: - compact() uses a file-swap approach (new db file + Path.replace) rather than ALTER TABLE RENAME, which does not cascade to shadow tables in sqlite-vec 0.1.9 (upstream limitation). - Bloat is tracked via a cumulative total_inserts counter in index_meta because the _rowids shadow table does not accumulate deleted rows in 0.1.9 (contrary to the design doc assumption from #54). - None distances from the zero-vector cosine edge case are mapped to similarity 0.0 rather than raising TypeError. - Test suite updated accordingly: _bloat_ratio reads index_meta instead of _rowids; seed collision in force-compact test fixed (seed=100.0). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Enhancement(beta): wire indexing pipeline to the sqlite-vec store Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Enhancement(beta): move filename/storage path/ASN to node metadata Same treatment as title/tags/correspondent in #12944: excluded from the embedded text, visible to the LLM via metadata prepend. Changes embedded text for every document, so it ships inside the sqlite-vec transition, whose forced rebuild re-embeds everything anyway. 
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Test(beta): cover legacy LanceDB index cleanup and forced rebuild Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Chore(beta): drop lancedb dependency Fixes #12970: the package whose wheels SIGILL on non-AVX2 CPUs is no longer installed at all. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Chore(beta): partial pyrefly cleanup on sqlite-vec vector store - Add MetadataFilter import and isinstance guard in _build_where() - Add query_embedding None guard in query() - Fix dict.get() type-checker ambiguity in get_configured_model_name() Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Chore(beta): drop automatic LanceDB index cleanup on startup Leave legacy Lance directory removal to the user rather than deleting it automatically on first run. Beta policy: user is expected to do a clean re-embed anyway; no need for the system to silently delete their data. Remove _cleanup_legacy_lance_index(), the forced-rebuild path that called it, and the associated tests. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Chore(beta): ruff format pass on sqlite-vec AI files Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Removes the benchmarking file * Try to resolve or silence some semgrep. But we're using SQL here, not an ORM and we control the inputs, not users * Enhancement(beta): add schema migration machinery to sqlite-vec vector store Adds versioned schema migration support modelled after PR #12968's LanceDB approach, adapted for sqlite-vec's file-swap compaction pattern. 
- SCHEMA_VERSION = 1 written to index_meta at table creation and preserved through compact() - Migration dataclass with from_version, to_version, kind ("structural" or "re-embed"), description, and an optional apply(src, dst, dim) callable - MIGRATIONS registry (empty at v1 baseline); add entries and bump SCHEMA_VERSION when the schema changes - check_and_run_migrations(): structural migrations run via the same file-swap as compact() (no re-embed); re-embed migrations return True so the caller forces a full rebuild - update_llm_index() calls check_and_run_migrations() under the write lock before any indexing work Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Chore(beta): deduplicate vector store internals via helper methods Extract three helpers to remove copy-paste between compact() and _run_structural_migration(): - _meta_set_on(conn, key, value): static upsert into any connection's index_meta; _meta_set() now delegates to it - _create_vec_table(conn, dim): CREATE VIRTUAL TABLE DDL (carries the nosemgrep annotation) - _swap_in_compact(compact_path, db_path): close/replace/reconnect sequence used by both file-swap callers Also normalises compact() error-path cleanup to unlink(missing_ok=True). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Adds equality test and no covers some defensive error handling stuff * Ensures an embed migration stops the migration chain, just in case * Silence one kind right but not really semgrep * Trims dead assignment * Fix(beta): address Copilot review on sqlite-vec vector store Three findings from the PR review: - compact() failure cleanup now unlinks the temporary .compact-wal and .compact-shm files, matching _run_structural_migration(); previously only the main .compact file was removed. - _build_where() fails closed (1 = 0) when filters are requested but none translate, instead of emitting "()" which is invalid SQL; filters scope document access, so an empty translation must match no rows. 
- Drop the unused table_name constructor parameter (all SQL hardcodes DEFAULT_TABLE_NAME) and its callers in indexing.py. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> * Enhancement(beta): guard sqlite-vec compaction swap against concurrent readers The compaction/migration file swap replaces the database via os.replace, but the -wal/-shm files are keyed by path, not inode. A reader holding an open connection across the swap leaves the old WAL aliased onto the new file; a subsequent write then corrupts the database (reproduced via PRAGMA integrity_check). Add a cross-process read/write lock (filelock.ReadWriteLock) over the index: - read_store() holds it shared for the whole connection lifetime (and closes the connection on exit); concurrent readers do not block. - compaction and the migration check run under an exclusive lock that drains readers, and skip with an info log on Timeout (maintenance op, retries next run). - Normal writes are untouched: WAL gives reader/writer concurrency and LLM_INDEX_LOCK still serializes writers, so they never block readers. load_or_build_index() now takes the store from the caller's read_store() so the lock and connection span the whole retrieval; chat holds it across the streamed response. Two new settings: LLM_INDEX_RWLOCK and LLM_INDEX_COMPACTION_LOCK_TIMEOUT. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> * Ensures the store alays cleans up SQLite connections for any operations, even on errors --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
349 lines
9.6 KiB
TOML
349 lines
9.6 KiB
TOML
[project]
name = "paperless-ngx"
version = "3.0.0"
description = "A community-supported supercharged document management system: scan, index and archive all your physical documents"
readme = "README.md"
requires-python = ">=3.11"
classifiers = [
  "Programming Language :: Python :: 3 :: Only",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
  "Programming Language :: Python :: 3.13",
  "Programming Language :: Python :: 3.14",
]
# TODO: Move certain things to groups and then utilize that further
# This will allow testing to not install a webserver, mysql, etc
dependencies = [
  "azure-ai-documentintelligence>=1.0.2",
  "babel>=2.17",
  "bleach~=6.3.0",
  "celery[redis]~=5.6.2",
  "channels~=4.2",
  "channels-redis~=4.2",
  "concurrent-log-handler~=0.9.25",
  "dateparser~=1.2",
  # WARNING: django does not use semver.
  # Only patch versions are guaranteed to not introduce breaking changes.
  "django~=5.2.13",
  "django-allauth[mfa,socialaccount]~=65.16.0",
  "django-auditlog~=3.4.1",
  "django-cachalot~=2.9.0",
  "django-compression-middleware~=0.5.0",
  "django-cors-headers~=4.9.0",
  "django-extensions~=4.1",
  "django-filter~=25.1",
  "django-guardian~=3.3.0",
  "django-multiselectfield~=1.0.1",
  "django-rich~=2.2.0",
  "django-soft-delete~=1.0.18",
  "django-treenode>=0.24",
  "djangorestframework~=3.16",
  "djangorestframework-guardian~=0.4.0",
  "drf-spectacular~=0.28",
  "drf-spectacular-sidecar~=2026.5.1",
  "drf-writable-nested~=0.7.1",
  "filelock~=3.29.0",
  "flower~=2.0.1",
  "gotenberg-client~=0.14.0",
  "httpx-oauth~=0.16",
  "ijson>=3.2",
  "imap-tools~=1.13.0",
  "jinja2~=3.1.5",
  "langdetect~=1.0.9",
  "llama-index-core>=0.14.21",
  "llama-index-embeddings-huggingface>=0.6.1",
  "llama-index-embeddings-ollama>=0.9",
  "llama-index-embeddings-openai-like>=0.2.2",
  "llama-index-llms-ollama>=0.9.1",
  "llama-index-llms-openai-like>=0.7.1",
  "nltk~=3.9.1",
  "ocrmypdf~=17.4.2",
  "openai>=2.32",
  "pathvalidate~=3.3.1",
  "pdf2image~=1.17.0",
  "python-dateutil~=2.9.0",
  "python-dotenv~=1.2.1",
  "python-gnupg~=0.5.4",
  "python-ipware~=3.0.0",
  "python-magic~=0.4.27",
  "rapidfuzz~=3.14.5",
  "redis[hiredis]~=5.2.1",
  "regex>=2026.4.4",
  "scikit-learn~=1.8.0",
  "sentence-transformers>=5.4.1",
  "setproctitle~=1.3.4",
  # Pinned exactly on purpose: per the commit that added it, the 0.1.9 wheels
  # carry no baked SIMD flags (safe on pre-AVX2 CPUs), while the 0.1.10 alphas
  # bake -mavx and would reintroduce the #12970 crash class.
  "sqlite-vec==0.1.9",
  "tantivy~=0.26.0",
  "tika-client~=0.11.0",
  "torch~=2.11.0",
  "watchfiles>=1.1.1",
  "whitenoise~=6.11",
  "zxing-cpp~=3.0.0",
]
[project.optional-dependencies]
mariadb = [
  "mysqlclient~=2.2.7",
]
postgres = [
  "psycopg[c,pool]==3.3",
  # Direct dependency for proper resolution of the pre-built wheels
  "psycopg-c==3.3",
  "psycopg-pool==3.3",
]
webserver = [
  "granian[uvloop]~=2.7.0",
]

[dependency-groups]
# Aggregate group: pulls in the docs, lint and testing groups below.
dev = [
  { include-group = "docs" },
  { include-group = "lint" },
  { include-group = "testing" },
]
docs = [
  "zensical>=0.0.36",
]
lint = [
  "prek~=0.3.10",
  "ruff~=0.15.12",
]
testing = [
  "daphne",
  "factory-boy~=3.3.1",
  "faker~=40.15.0",
  "imagehash",
  "pytest~=9.0.3",
  "pytest-cov~=7.1.0",
  "pytest-django~=4.12.0",
  "pytest-env~=1.6.0",
  "pytest-httpx",
  "pytest-mock~=3.15.1",
  # "pytest-randomly~=4.0.1",
  "pytest-rerunfailures~=16.1",
  "pytest-sugar",
  "pytest-xdist~=3.8.0",
  "time-machine>=2.13",
]
typing = [
  "celery-types",
  "django-filter-stubs",
  "django-stubs[compatible-mypy]",
  "djangorestframework-stubs[compatible-mypy]",
  "lxml-stubs",
  "microsoft-python-type-stubs @ git+https://github.com/microsoft/python-type-stubs.git",
  "mypy",
  "mypy-baseline",
  "pyrefly",
  "types-bleach",
  "types-channels",
  "types-colorama",
  "types-dateparser",
  "types-markdown",
  "types-pygments",
  "types-python-dateutil",
  "types-pytz",
  "types-redis",
  "types-regex",
  "types-setuptools",
]

[tool.uv]
required-version = ">=0.9.0"
environments = [
  "sys_platform == 'darwin'",
  "sys_platform == 'linux'",
]
package = false
# Explicit index: only consulted for packages that name it in
# [tool.uv.sources] (torch, below).
[[tool.uv.index]]
name = "pytorch-cpu"
url = "https://download.pytorch.org/whl/cpu"
explicit = true
[tool.uv.sources]
# Markers are chosen to select these almost exclusively when building the Docker image
psycopg-c = [
  { url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-trixie-3.3.0/psycopg_c-3.3.0-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux' and platform_machine == 'x86_64' and python_version == '3.12'" },
  { url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-trixie-3.3.0/psycopg_c-3.3.0-cp312-cp312-linux_aarch64.whl", marker = "sys_platform == 'linux' and platform_machine == 'aarch64' and python_version == '3.12'" },
]
torch = [
  { index = "pytorch-cpu" },
]

[tool.ruff]
target-version = "py311"
line-length = 88
src = [
  "src",
]
respect-gitignore = true
# https://docs.astral.sh/ruff/settings/
fix = true
show-fixes = true
output-format = "grouped"
[tool.ruff.format]
line-ending = "lf"
[tool.ruff.lint]
# https://docs.astral.sh/ruff/rules/
extend-select = [
  "COM", # https://docs.astral.sh/ruff/rules/#flake8-commas-com
  "DJ", # https://docs.astral.sh/ruff/rules/#flake8-django-dj
  "EXE", # https://docs.astral.sh/ruff/rules/#flake8-executable-exe
  "FBT", # https://docs.astral.sh/ruff/rules/#flake8-boolean-trap-fbt
  "FLY", # https://docs.astral.sh/ruff/rules/#flynt-fly
  "G201", # https://docs.astral.sh/ruff/rules/#flake8-logging-format-g
  "I", # https://docs.astral.sh/ruff/rules/#isort-i
  "ICN", # https://docs.astral.sh/ruff/rules/#flake8-import-conventions-icn
  "INP", # https://docs.astral.sh/ruff/rules/#flake8-no-pep420-inp
  "ISC", # https://docs.astral.sh/ruff/rules/#flake8-implicit-str-concat-isc
  "PIE", # https://docs.astral.sh/ruff/rules/#flake8-pie-pie
  "PLC", # https://docs.astral.sh/ruff/rules/#pylint-pl
  "PLE", # https://docs.astral.sh/ruff/rules/#pylint-pl
  "PTH", # https://docs.astral.sh/ruff/rules/#flake8-use-pathlib-pth
  "Q", # https://docs.astral.sh/ruff/rules/#flake8-quotes-q
  "RSE", # https://docs.astral.sh/ruff/rules/#flake8-raise-rse
  "RUF", # https://docs.astral.sh/ruff/rules/#ruff-specific-rules-ruf
  "SIM", # https://docs.astral.sh/ruff/rules/#flake8-simplify-sim
  "T20", # https://docs.astral.sh/ruff/rules/#flake8-print-t20
  "TC", # https://docs.astral.sh/ruff/rules/#flake8-type-checking-tc
  "TID", # https://docs.astral.sh/ruff/rules/#flake8-tidy-imports-tid
  "UP", # https://docs.astral.sh/ruff/rules/#pyupgrade-up
  "W", # https://docs.astral.sh/ruff/rules/#pycodestyle-e-w
]
ignore = [
  "DJ001",
  "PLC0415",
  "RUF012",
  "SIM105",
]
# Migrations
per-file-ignores."*/migrations/*.py" = [
  "E501",
  "SIM",
  "T201",
]
# Testing
per-file-ignores."*/tests/*.py" = [
  "E501",
  "SIM117",
]
per-file-ignores.".github/scripts/*.py" = [
  "E501",
  "INP001",
  "SIM117",
]
# Docker specific
per-file-ignores."docker/rootfs/usr/local/bin/wait-for-redis.py" = [
  "INP001",
  "T201",
]
per-file-ignores."docker/wait-for-redis.py" = [
  "INP001",
  "T201",
]
per-file-ignores."src/documents/models.py" = [
  "SIM115",
]
isort.force-single-line = true

[tool.codespell]
write-changes = true
ignore-words-list = "criterias,afterall,valeu,ureue,equest,ure,assertIn,Oktober,commitish"
# One comma-separated value; the line-ending backslashes fold the lines
# (and their leading whitespace) into a single string with no newlines.
skip = """\
  src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/paperless/tests/samples\
  /mail/*,src/documents/tests/samples/*,*.po,*.json\
  """

[tool.pyproject-fmt]
table_format = "long"

[tool.mypy]
mypy_path = "src"
plugins = [
  "mypy_django_plugin.main",
  "mypy_drf_plugin.main",
]
check_untyped_defs = true
disallow_any_generics = true
disallow_incomplete_defs = true
disallow_untyped_defs = true
warn_redundant_casts = true
warn_unused_ignores = true

[tool.pyrefly]
search-path = [ "src" ]
baseline = ".pyrefly-baseline.json"
python-platform = "linux"

[tool.django-stubs]
django_settings_module = "paperless.settings"

[tool.pytest]
minversion = "9.0"
pythonpath = [ "src" ]
strict_config = true
strict_markers = true
strict_parametrization_ids = true
strict_xfail = true
testpaths = [
  "src/documents/tests/",
  "src/paperless/tests/",
  "src/paperless_mail/tests/",
  "src/paperless_ai/tests",
]
addopts = [
  "--pythonwarnings=all",
  "--cov",
  "--cov-report=html",
  "--cov-report=xml",
  "--numprocesses=auto",
  "--maxprocesses=16",
  "--dist=loadscope",
  "--durations=50",
  "--durations-min=0.5",
  "--junitxml=junit.xml",
  "-o",
  "junit_family=legacy",
]
norecursedirs = [ "src/locale/", ".venv/", "src-ui/" ]
DJANGO_SETTINGS_MODULE = "paperless.settings"
# strict_markers is enabled above, so every marker used in the test suite
# must be declared here.
markers = [
  "live: Integration tests requiring external services (Gotenberg, Tika, nginx, etc)",
  "nginx: Tests that make HTTP requests to the local nginx service",
  "gotenberg: Tests requiring Gotenberg service",
  "tika: Tests requiring Tika service",
  "greenmail: Tests requiring Greenmail service",
  "date_parsing: Tests which cover date parsing from content or filename",
  "management: Tests which cover management commands/functionality",
  "search: Tests for the Tantivy search backend",
  "api: Tests for REST API endpoints",
]

[tool.pytest_env]
PAPERLESS_SECRET_KEY = "test-secret-key-do-not-use-in-production"
PAPERLESS_DISABLE_DBHANDLER = "true"
PAPERLESS_CACHE_BACKEND = "django.core.cache.backends.locmem.LocMemCache"
PAPERLESS_CHANNELS_BACKEND = "channels.layers.InMemoryChannelLayer"
# I don't think anything hits this, but just in case, basically infinite
PAPERLESS_TOKEN_THROTTLE_RATE = "1000/min"

[tool.coverage.report]
exclude_also = [
  "if settings.AUDIT_LOG_ENABLED:",
  "if AUDIT_LOG_ENABLED:",
  "if TYPE_CHECKING:",
]
[tool.coverage.run]
source = [
  "src/",
]
omit = [
  "*/tests/*",
  "manage.py",
  "paperless/wsgi.py",
  "paperless/auth.py",
]

[tool.mypy-baseline]
baseline_path = ".mypy-baseline.txt"
sort_baseline = true
ignore_categories = [ "note" ]