mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-29 08:44:24 +00:00
a020f64d08
* Chore(beta): add sqlite-vec 0.1.9 dependency Pinned exactly: the 0.1.9 wheels carry no baked SIMD flags (safe on pre-AVX2 CPUs, the point of this migration); the 0.1.10 alphas bake -mavx and would reintroduce the #12970 crash class. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Test(beta): port vector store tests to sqlite-vec backend Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Enhancement(beta): switch AI vector store from LanceDB to sqlite-vec Fixes the non-AVX2 SIGILL class (#12970) at the root: lancedb is no longer imported. sqlite-vec 0.1.9 wheels carry no baked SIMD, vec0 metadata columns give parameterized EQ/IN filtering, WAL preserves the lock-free-reader model, and compact() rebuilds the table because vec0 DELETEs never reclaim space. Implementation notes vs. the Task 3A draft: - compact() uses a file-swap approach (new db file + Path.replace) rather than ALTER TABLE RENAME, which does not cascade to shadow tables in sqlite-vec 0.1.9 (upstream limitation). - Bloat is tracked via a cumulative total_inserts counter in index_meta because the _rowids shadow table does not accumulate deleted rows in 0.1.9 (contrary to the design doc assumption from #54). - None distances from the zero-vector cosine edge case are mapped to similarity 0.0 rather than raising TypeError. - Test suite updated accordingly: _bloat_ratio reads index_meta instead of _rowids; seed collision in force-compact test fixed (seed=100.0). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Enhancement(beta): wire indexing pipeline to the sqlite-vec store Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Enhancement(beta): move filename/storage path/ASN to node metadata Same treatment as title/tags/correspondent in #12944: excluded from the embedded text, visible to the LLM via metadata prepend. Changes embedded text for every document, so it ships inside the sqlite-vec transition, whose forced rebuild re-embeds everything anyway. 
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Test(beta): cover legacy LanceDB index cleanup and forced rebuild Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Chore(beta): drop lancedb dependency Fixes #12970: the package whose wheels SIGILL on non-AVX2 CPUs is no longer installed at all. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Chore(beta): partial pyrefly cleanup on sqlite-vec vector store - Add MetadataFilter import and isinstance guard in _build_where() - Add query_embedding None guard in query() - Fix dict.get() type-checker ambiguity in get_configured_model_name() Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Chore(beta): drop automatic LanceDB index cleanup on startup Leave legacy Lance directory removal to the user rather than deleting it automatically on first run. Beta policy: user is expected to do a clean re-embed anyway; no need for the system to silently delete their data. Remove _cleanup_legacy_lance_index(), the forced-rebuild path that called it, and the associated tests. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Chore(beta): ruff format pass on sqlite-vec AI files Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Removes the benchmarking file * Try to resolve or silence some semgrep. But we're using SQL here, not an ORM and we control the inputs, not users * Enhancement(beta): add schema migration machinery to sqlite-vec vector store Adds versioned schema migration support modelled after PR #12968's LanceDB approach, adapted for sqlite-vec's file-swap compaction pattern. 
- SCHEMA_VERSION = 1 written to index_meta at table creation and preserved through compact() - Migration dataclass with from_version, to_version, kind ("structural" or "re-embed"), description, and an optional apply(src, dst, dim) callable - MIGRATIONS registry (empty at v1 baseline); add entries and bump SCHEMA_VERSION when the schema changes - check_and_run_migrations(): structural migrations run via the same file-swap as compact() (no re-embed); re-embed migrations return True so the caller forces a full rebuild - update_llm_index() calls check_and_run_migrations() under the write lock before any indexing work Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Chore(beta): deduplicate vector store internals via helper methods Extract three helpers to remove copy-paste between compact() and _run_structural_migration(): - _meta_set_on(conn, key, value): static upsert into any connection's index_meta; _meta_set() now delegates to it - _create_vec_table(conn, dim): CREATE VIRTUAL TABLE DDL (carries the nosemgrep annotation) - _swap_in_compact(compact_path, db_path): close/replace/reconnect sequence used by both file-swap callers Also normalises compact() error-path cleanup to unlink(missing_ok=True). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Adds equality test and no covers some defensive error handling stuff * Ensures an embed migration stops the migration chain, just in case * Silence one kind right but not really semgrep * Trims dead assignment * Fix(beta): address Copilot review on sqlite-vec vector store Three findings from the PR review: - compact() failure cleanup now unlinks the temporary .compact-wal and .compact-shm files, matching _run_structural_migration(); previously only the main .compact file was removed. - _build_where() fails closed (1 = 0) when filters are requested but none translate, instead of emitting "()" which is invalid SQL; filters scope document access, so an empty translation must match no rows. 
- Drop the unused table_name constructor parameter (all SQL hardcodes DEFAULT_TABLE_NAME) and its callers in indexing.py. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> * Enhancement(beta): guard sqlite-vec compaction swap against concurrent readers The compaction/migration file swap replaces the database via os.replace, but the -wal/-shm files are keyed by path, not inode. A reader holding an open connection across the swap leaves the old WAL aliased onto the new file; a subsequent write then corrupts the database (reproduced via PRAGMA integrity_check). Add a cross-process read/write lock (filelock.ReadWriteLock) over the index: - read_store() holds it shared for the whole connection lifetime (and closes the connection on exit); concurrent readers do not block. - compaction and the migration check run under an exclusive lock that drains readers, and skip with an info log on Timeout (maintenance op, retries next run). - Normal writes are untouched: WAL gives reader/writer concurrency and LLM_INDEX_LOCK still serializes writers, so they never block readers. load_or_build_index() now takes the store from the caller's read_store() so the lock and connection span the whole retrieval; chat holds it across the streamed response. Two new settings: LLM_INDEX_RWLOCK and LLM_INDEX_COMPACTION_LOCK_TIMEOUT. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> * Ensures the store always cleans up SQLite connections for any operations, even on errors --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
135 lines
4.6 KiB
Python
135 lines
4.6 KiB
Python
import logging
|
|
import sqlite3
|
|
import threading
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
from django.conf import settings
|
|
from filelock import ReadWriteLock
|
|
from llama_index.core.schema import TextNode
|
|
from pytest_django.fixtures import SettingsWrapper
|
|
|
|
from paperless_ai import indexing
|
|
from paperless_ai.vector_store import PaperlessSqliteVecVectorStore
|
|
|
|
# Number of components in the embedding vector that _node() attaches to each
# test node.
DIM = 8
|
|
|
|
|
|
def _node(node_id: str, document_id: str, *, seed: float = 0.0) -> TextNode:
    """Build a minimal ``TextNode`` carrying a deterministic embedding.

    The node has the fixed text ``"chunk"``, metadata holding *document_id*
    and a constant ``modified`` timestamp, no relationships, and a
    ``DIM``-component embedding in which every component is offset by *seed*.
    """
    node_metadata = {
        "document_id": document_id,
        "modified": "2026-06-01T00:00:00",
    }
    text_node = TextNode(id_=node_id, text="chunk", metadata=node_metadata)
    text_node.relationships = {}
    text_node.embedding = [seed + position / 100 for position in range(DIM)]
    return text_node
|
|
|
|
|
|
def _seed_bloated_index(index_dir: Path) -> None:
    """Create an index whose cumulative inserts far exceed live rows.

    Adds 20 one-node documents and then upserts every one of them six more
    times. The tests in this module rely on the result holding 20 live rows
    with a bloat ratio above 2.

    Args:
        index_dir: Directory passed to the vector store as its ``uri``.
    """
    store = PaperlessSqliteVecVectorStore(uri=str(index_dir))
    try:
        store.add([_node(f"d{j}", str(j), seed=float(j)) for j in range(20)])
        for cycle in range(6):
            for j in range(20):
                store.upsert_document(
                    str(j),
                    [_node(f"d{j}-c{cycle}", str(j), seed=float(j))],
                )
    finally:
        # Release the SQLite connection even if seeding fails, so a broken
        # setup does not leave an open handle on the index file.
        store.client.close()
|
|
|
|
|
|
def _bloat_ratio(index_dir: Path) -> float:
    """Return cumulative inserts divided by live rows for the index.

    Live rows are counted from the ``documents`` table; cumulative inserts
    come from the ``total_inserts`` entry in ``index_meta``. When that entry
    is missing the live count is used instead, giving a ratio of 1.0 for a
    non-empty table.

    Args:
        index_dir: Directory passed to the vector store as its ``uri``.

    Returns:
        ``total_inserts / max(live_rows, 1)``; the ``max`` avoids dividing by
        zero on an empty table.
    """
    store = PaperlessSqliteVecVectorStore(uri=str(index_dir))
    try:
        live = store.client.execute("SELECT count(*) FROM documents").fetchone()[0]
        row = store.client.execute(
            "SELECT value FROM index_meta WHERE key = 'total_inserts'",
        ).fetchone()
        total = int(row["value"]) if row else live
    finally:
        # Close the connection even when a query raises.
        store.client.close()
    return total / max(live, 1)
|
|
|
|
|
|
def _integrity_ok(index_dir: Path, *, expected_rows: int = 20) -> bool:
    """Check that the index passes SQLite's integrity check and kept its rows.

    Args:
        index_dir: Directory passed to the vector store as its ``uri``.
        expected_rows: Number of rows the ``documents`` table must contain.
            Defaults to 20, the number of documents ``_seed_bloated_index``
            creates.

    Returns:
        ``True`` only if ``PRAGMA integrity_check`` reports ``"ok"`` and the
        ``documents`` table holds exactly *expected_rows* rows.
    """
    store = PaperlessSqliteVecVectorStore(uri=str(index_dir))
    try:
        result = store.client.execute("PRAGMA integrity_check").fetchone()[0]
        rows = store.client.execute("SELECT count(*) FROM documents").fetchone()[0]
    finally:
        # Close the connection even when a query raises (for example on a
        # database that is too damaged to query).
        store.client.close()
    return result == "ok" and rows == expected_rows
|
|
|
|
|
|
def _reader_lock() -> ReadWriteLock:
    """Return a new, non-singleton lock handle on ``settings.LLM_INDEX_RWLOCK``."""
    # is_singleton=False yields a distinct instance, which stands in for a
    # reader living in another process: it coordinates with the production
    # lock purely through SQLite, never through a reentrant upgrade.
    rwlock_path = str(settings.LLM_INDEX_RWLOCK)
    return ReadWriteLock(rwlock_path, is_singleton=False)
|
|
|
|
|
|
class TestCompactionLock:
    """How index compaction and ordinary writes interact with the index RW lock."""

    def test_compaction_skips_when_a_reader_holds_the_lock(
        self,
        temp_llm_index_dir: Path,
        settings: SettingsWrapper,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """Compaction logs a skip and leaves the index untouched under a read lock."""
        _seed_bloated_index(temp_llm_index_dir)
        # NOTE(review): presumably how long compaction waits for the exclusive
        # lock before giving up; kept small so the test stays fast.
        settings.LLM_INDEX_COMPACTION_LOCK_TIMEOUT = 0.3

        lock = _reader_lock()
        try:
            with lock.read_lock(), caplog.at_level(logging.INFO):
                indexing.llm_index_compact()  # must not raise
        finally:
            # Close the lock handle even if compaction raises, so a failing
            # test does not leave it open for the tests that follow.
            lock.close()

        # Swap was skipped: bloat remains, nothing corrupted, data intact.
        assert _integrity_ok(temp_llm_index_dir)
        assert _bloat_ratio(temp_llm_index_dir) > 2
        assert "Skipping LLM index compaction" in caplog.text

    def test_compaction_runs_when_no_reader_holds_the_lock(
        self,
        temp_llm_index_dir: Path,
    ) -> None:
        """Without a competing lock holder, compaction brings the bloat ratio to 1."""
        _seed_bloated_index(temp_llm_index_dir)
        assert _bloat_ratio(temp_llm_index_dir) > 2

        indexing.llm_index_compact()

        assert _bloat_ratio(temp_llm_index_dir) == pytest.approx(1.0)
        assert _integrity_ok(temp_llm_index_dir)

    def test_normal_write_is_not_gated_by_the_compaction_lock(
        self,
        temp_llm_index_dir: Path,
    ) -> None:
        """A held exclusive lock must not block ordinary writes (WAL handles them)."""
        _seed_bloated_index(temp_llm_index_dir)
        done = threading.Event()

        def remove() -> None:
            # Runs in a worker thread while the main thread holds the write lock.
            indexing.llm_index_remove_document(MagicMock(id=999))
            done.set()

        holder = _reader_lock()
        try:
            with holder.write_lock():
                t = threading.Thread(target=remove)
                t.start()
                # The write has to finish while the exclusive lock is still held.
                finished = done.wait(timeout=5)
                t.join(timeout=2)
        finally:
            # Close the lock handle on every path, including a failure above.
            holder.close()
        assert finished, "a normal write blocked on the compaction lock"
|
|
|
|
|
|
class TestReadStore:
    """Behaviour of the ``indexing.read_store()`` context manager."""

    def test_closes_connection_on_exit(self, temp_llm_index_dir: Path) -> None:
        """The connection works inside the block and is closed once it exits."""
        with indexing.read_store() as store:
            connection = store.client
            first_row = connection.execute("SELECT 1").fetchone()
            assert first_row[0] == 1

        # sqlite3 raises ProgrammingError when a closed connection is used.
        with pytest.raises(sqlite3.ProgrammingError):
            connection.execute("SELECT 1")

    def test_concurrent_readers_do_not_block(self, temp_llm_index_dir: Path) -> None:
        """Two read stores can be open at once and both see the table."""
        _seed_bloated_index(temp_llm_index_dir)
        with indexing.read_store() as first_reader:
            with indexing.read_store() as second_reader:
                assert first_reader.table_exists()
                assert second_reader.table_exists()
|