mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-24 22:34:22 +00:00
Feature: Replace Whoosh with tantivy search backend (#12471)
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com> Co-authored-by: Antoine Mérino <3023499+Merinorus@users.noreply.github.com>
This commit is contained in:
+13
-32
@@ -4,11 +4,9 @@ import shutil
|
||||
import uuid
|
||||
import zipfile
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from tempfile import mkstemp
|
||||
from typing import TypeVar
|
||||
|
||||
from celery import Task
|
||||
from celery import shared_task
|
||||
@@ -20,9 +18,7 @@ from django.db import transaction
|
||||
from django.db.models.signals import post_save
|
||||
from django.utils import timezone
|
||||
from filelock import FileLock
|
||||
from whoosh.writing import AsyncWriter
|
||||
|
||||
from documents import index
|
||||
from documents import sanity_checker
|
||||
from documents.barcodes import BarcodePlugin
|
||||
from documents.bulk_download import ArchiveOnlyStrategy
|
||||
@@ -60,7 +56,9 @@ from documents.signals import document_updated
|
||||
from documents.signals.handlers import cleanup_document_deletion
|
||||
from documents.signals.handlers import run_workflows
|
||||
from documents.signals.handlers import send_websocket_document_updated
|
||||
from documents.utils import IterWrapper
|
||||
from documents.utils import compute_checksum
|
||||
from documents.utils import identity
|
||||
from documents.workflows.utils import get_workflows_for_trigger
|
||||
from paperless.config import AIConfig
|
||||
from paperless.parsers import ParserContext
|
||||
@@ -69,34 +67,16 @@ from paperless_ai.indexing import llm_index_add_or_update_document
|
||||
from paperless_ai.indexing import llm_index_remove_document
|
||||
from paperless_ai.indexing import update_llm_index
|
||||
|
||||
_T = TypeVar("_T")
|
||||
IterWrapper = Callable[[Iterable[_T]], Iterable[_T]]
|
||||
|
||||
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.models import LogEntry
|
||||
logger = logging.getLogger("paperless.tasks")
|
||||
|
||||
|
||||
def _identity(iterable: Iterable[_T]) -> Iterable[_T]:
|
||||
return iterable
|
||||
|
||||
|
||||
@shared_task
|
||||
def index_optimize() -> None:
|
||||
ix = index.open_index()
|
||||
writer = AsyncWriter(ix)
|
||||
writer.commit(optimize=True)
|
||||
|
||||
|
||||
def index_reindex(*, iter_wrapper: IterWrapper[Document] = _identity) -> None:
|
||||
documents = Document.objects.all()
|
||||
|
||||
ix = index.open_index(recreate=True)
|
||||
|
||||
with AsyncWriter(ix) as writer:
|
||||
for document in iter_wrapper(documents):
|
||||
index.update_document(writer, document)
|
||||
logger.info(
|
||||
"index_optimize is a no-op — Tantivy manages segment merging automatically.",
|
||||
)
|
||||
|
||||
|
||||
@shared_task
|
||||
@@ -270,9 +250,9 @@ def sanity_check(*, scheduled=True, raise_on_error=True):
|
||||
|
||||
@shared_task
|
||||
def bulk_update_documents(document_ids) -> None:
|
||||
documents = Document.objects.filter(id__in=document_ids)
|
||||
from documents.search import get_backend
|
||||
|
||||
ix = index.open_index()
|
||||
documents = Document.objects.filter(id__in=document_ids)
|
||||
|
||||
for doc in documents:
|
||||
clear_document_caches(doc.pk)
|
||||
@@ -283,9 +263,9 @@ def bulk_update_documents(document_ids) -> None:
|
||||
)
|
||||
post_save.send(Document, instance=doc, created=False)
|
||||
|
||||
with AsyncWriter(ix) as writer:
|
||||
with get_backend().batch_update() as batch:
|
||||
for doc in documents:
|
||||
index.update_document(writer, doc)
|
||||
batch.add_or_update(doc)
|
||||
|
||||
ai_config = AIConfig()
|
||||
if ai_config.llm_index_enabled:
|
||||
@@ -389,8 +369,9 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
logger.info(
|
||||
f"Updating index for document {document_id} ({document.archive_checksum})",
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, document)
|
||||
from documents.search import get_backend
|
||||
|
||||
get_backend().add_or_update(document)
|
||||
|
||||
ai_config = AIConfig()
|
||||
if ai_config.llm_index_enabled:
|
||||
@@ -633,7 +614,7 @@ def update_document_parent_tags(tag: Tag, new_parent: Tag) -> None:
|
||||
@shared_task
|
||||
def llmindex_index(
|
||||
*,
|
||||
iter_wrapper: IterWrapper[Document] = _identity,
|
||||
iter_wrapper: IterWrapper[Document] = identity,
|
||||
rebuild=False,
|
||||
scheduled=True,
|
||||
auto=False,
|
||||
|
||||
Reference in New Issue
Block a user