mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-07-02 02:04:19 +00:00
2c58d86380
* Fix: Remove all nodes for multi-chunk documents in update_llm_index incremental path The existing_nodes dict comprehension keyed on document_id silently dropped all but the last node per document, so only that one node was deleted when a modified document was re-indexed, leaving all other chunks as ghost vectors in the FAISS index. Switch to a defaultdict(list) that collects every node per document_id, then iterate and delete all of them before inserting fresh nodes. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Fix: Wire document_updated signal to LLM index update handler Connect document_updated to add_or_update_document_in_llm_index in DocumentsConfig.ready() so REST API edits (PATCH /api/documents/{id}/) enqueue an LLM vector store update, matching the existing document_consumption_finished behavior. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Fix: Add file lock around FAISS index mutations to prevent concurrent write corruption Two concurrent Celery workers calling llm_index_add_or_update_document or llm_index_remove_document each loaded the same on-disk index independently, made their own change, and the last writer silently overwrote the first's update. Wrap both functions and the rebuild/persist body of update_llm_index in a filelock.FileLock keyed on LLM_INDEX_DIR/index.lock. Add a TOCTOU comment on queue_llm_index_update_if_needed explaining the residual risk (duplicate rebuild tasks are wasteful but not corrupting because the lock serialises the actual write). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Fix: Apply _normalize() in extract_unmatched_names to prevent duplicate suggestions extract_unmatched_names was using .lower() while _match_names_to_queryset uses _normalize() (which also strips punctuation). A name like "J. Smith" matched to existing correspondent "J Smith" would still appear in the unmatched list, causing duplicate object creation. 
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Fix: Skip LLM index update gracefully when document has no indexable content Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Fix: Persist empty index when all documents are deleted to clear stale FAISS vectors The early-return guard in update_llm_index fired before persist() when no documents existed, leaving a stale on-disk FAISS index that returned phantom hits for deleted document IDs. Now the guard only returns early for the incremental (rebuild=False) path when no index exists on disk; the rebuild path always continues through to persist(), producing an empty clean index. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Chore: Simplify incremental index update — use docs.values() and deduplicate node extend --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
103 lines
2.8 KiB
Python
103 lines
2.8 KiB
Python
import difflib
|
|
import logging
|
|
import re
|
|
|
|
from django.contrib.auth.models import User
|
|
|
|
from documents.models import Correspondent
|
|
from documents.models import DocumentType
|
|
from documents.models import StoragePath
|
|
from documents.models import Tag
|
|
from documents.permissions import get_objects_for_user_owner_aware
|
|
|
|
MATCH_THRESHOLD = 0.8
|
|
|
|
logger = logging.getLogger("paperless_ai.matching")
|
|
|
|
|
|
def match_tags_by_name(names: list[str], user: User) -> list[Tag]:
    """Match suggested tag names against existing tags.

    Candidates are the ``Tag`` objects that
    ``get_objects_for_user_owner_aware`` returns for *user* with the
    ``view_tag`` permission; the matching itself is delegated to
    ``_match_names_to_queryset`` using each tag's ``name`` attribute.
    """
    candidate_tags = get_objects_for_user_owner_aware(user, ["view_tag"], Tag)
    return _match_names_to_queryset(names, candidate_tags, attr="name")
|
|
|
|
|
|
def match_correspondents_by_name(names: list[str], user: User) -> list[Correspondent]:
    """Match suggested correspondent names against existing correspondents.

    Candidates are the ``Correspondent`` objects that
    ``get_objects_for_user_owner_aware`` returns for *user* with the
    ``view_correspondent`` permission; the matching itself is delegated to
    ``_match_names_to_queryset`` using each object's ``name`` attribute.
    """
    candidate_correspondents = get_objects_for_user_owner_aware(
        user,
        ["view_correspondent"],
        Correspondent,
    )
    return _match_names_to_queryset(names, candidate_correspondents, attr="name")
|
|
|
|
|
|
def match_document_types_by_name(names: list[str], user: User) -> list[DocumentType]:
    """Match suggested document type names against existing document types.

    Candidates are the ``DocumentType`` objects that
    ``get_objects_for_user_owner_aware`` returns for *user* with the
    ``view_documenttype`` permission; the matching itself is delegated to
    ``_match_names_to_queryset`` using each object's ``name`` attribute.
    """
    candidate_types = get_objects_for_user_owner_aware(
        user,
        ["view_documenttype"],
        DocumentType,
    )
    return _match_names_to_queryset(names, candidate_types, attr="name")
|
|
|
|
|
|
def match_storage_paths_by_name(names: list[str], user: User) -> list[StoragePath]:
    """Match suggested storage path names against existing storage paths.

    Candidates are the ``StoragePath`` objects that
    ``get_objects_for_user_owner_aware`` returns for *user* with the
    ``view_storagepath`` permission; the matching itself is delegated to
    ``_match_names_to_queryset`` using each object's ``name`` attribute.
    """
    candidate_paths = get_objects_for_user_owner_aware(
        user,
        ["view_storagepath"],
        StoragePath,
    )
    return _match_names_to_queryset(names, candidate_paths, attr="name")
|
|
|
|
|
|
def _normalize(s: str) -> str:
|
|
s = s.lower()
|
|
s = re.sub(r"[^\w\s]", "", s) # remove punctuation
|
|
s = s.strip()
|
|
return s
|
|
|
|
|
|
def _match_names_to_queryset(names: list[str], queryset, attr: str):
    """Return the objects from *queryset* that correspond to *names*.

    Each non-empty name is normalized with ``_normalize`` and compared with
    the normalized ``attr`` value of every remaining candidate. An exact
    match is preferred; otherwise the single closest candidate reported by
    ``difflib.get_close_matches`` with a cutoff of ``MATCH_THRESHOLD`` is
    used. A matched object is removed from the candidate pool, so it appears
    at most once in the result. Names with no match are skipped.

    The result is ordered by the position of the matching name in *names*.
    """
    candidates = list(queryset)
    candidate_keys = [_normalize(getattr(candidate, attr)) for candidate in candidates]
    matched_objects = []

    for raw_name in names:
        if not raw_name:
            continue
        key = _normalize(raw_name)

        if key in candidate_keys:
            # Exact hit on the normalized name.
            best_key = key
        else:
            # Fall back to the closest candidate above the similarity cutoff.
            close = difflib.get_close_matches(
                key,
                candidate_keys,
                n=1,
                cutoff=MATCH_THRESHOLD,
            )
            if not close:
                continue
            best_key = close[0]

        position = candidate_keys.index(best_key)
        # Pop from both lists so they stay index-aligned.
        candidate_keys.pop(position)
        matched_objects.append(candidates.pop(position))

    return matched_objects
|
|
|
|
|
|
def extract_unmatched_names(
    names: list[str],
    matched_objects: list,
    attr="name",
) -> list[str]:
    """Return the entries of *names* that have no counterpart in *matched_objects*.

    A name counts as matched when its ``_normalize``-d form equals the
    normalized ``attr`` value of any object in *matched_objects*. The
    original (un-normalized) strings are returned, in input order.

    Falsy entries (empty strings, ``None``) are dropped rather than reported
    as unmatched: ``_match_names_to_queryset`` skips them as well, and they
    would otherwise either surface as an empty "new" name or raise inside
    ``_normalize``.
    """
    matched_names = {_normalize(getattr(obj, attr)) for obj in matched_objects}
    return [
        name
        for name in names
        if name and _normalize(name) not in matched_names
    ]
|