mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-23 17:32:44 +00:00
* Chore: move Tika parser and tests to paperless/ Move TikaDocumentParser and its tests to the canonical parser package location, matching the pattern established for TextDocumentParser: - src/paperless_tika/parsers.py → src/paperless/parsers/tika.py - src/paperless_tika/tests/test_tika_parser.py → src/paperless/tests/parsers/test_tika_parser.py - src/paperless_tika/tests/samples/ → src/paperless/tests/samples/tika/ Merge tika fixtures (tika_parser, sample_odt_file, sample_docx_file, sample_doc_file, sample_broken_odt) into the shared parsers conftest. Remove the now-empty src/paperless_tika/tests/conftest.py. Content is unchanged — this commit is rename-only so git history is preserved on the moved files. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Feature: Phase 3 — migrate TikaDocumentParser to ParserProtocol Refactor TikaDocumentParser to satisfy ParserProtocol without subclassing the legacy DocumentParser ABC: - Add ClassVars: name, version, author, url - Add supported_mime_types() classmethod (12 Office/ODF/RTF MIME types) - Add score() classmethod — returns None when TIKA_ENABLED is False, 10 otherwise - can_produce_archive = False (PDF is for display, not an OCR archive) - requires_pdf_rendition = True (Office formats need PDF for browser display) - __enter__/__exit__ via ExitStack: TikaClient opened once per parser lifetime and shared across parse() and extract_metadata() calls - extract_metadata() falls back to a short-lived TikaClient when called outside a context manager (legacy view-layer metadata path) - _convert_to_pdf() uses OutputTypeConfig() to honour the database-stored ApplicationConfiguration before falling back to the env-var setting - Rename convert_to_pdf → _convert_to_pdf (private helper) Update paperless_tika/signals.py shim to import from the new module path and drop the legacy logging_group/progress_callback kwargs. 
Update documents/consumer.py to extend the existing TextDocumentParser special cases to also cover TikaDocumentParser (parse/get_thumbnail signatures, __exit__ cleanup). Add TestTikaParserRegistryInterface (7 tests) covering score(), properties, and ParserProtocol isinstance check. Update existing tests to use the new accessor API (get_text, get_date, get_archive_path, _convert_to_pdf). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Fix: update remaining imports and move live Tika tests after parser migration - src/documents/tests/test_parsers.py: import TikaDocumentParser from paperless.parsers.tika (old paperless_tika.parsers no longer exists) - git mv paperless_tika/tests/test_live_tika.py → paperless/tests/parsers/test_live_tika.py to co-locate all Tika tests with the parser; update import and replace old attribute API (tika_parser.text/.archive_path) with accessor methods (get_text/get_archive_path) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Fix: satisfy mypy and pyrefly for TikaDocumentParser Use a TYPE_CHECKING-guarded assert to narrow self._tika_client from TikaClient | None to TikaClient at the point of use in parse(). The assert is visible to type checkers (TYPE_CHECKING=True) so both mypy and pyrefly accept the subsequent attribute accesses without error; at runtime TYPE_CHECKING is False so the assert never executes and no ruff S101 suppression is required. 
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Fix: require context manager for TikaDocumentParser; clean up client lifecycle - consumer.py: call __enter__ for new-style parsers so _tika_client and _gotenberg_client are set before parse() is invoked - views.py: use `with parser` (via nullcontext for old-style parsers) in get_metadata so extract_metadata always runs inside a context manager - tika.py: GotenbergClient added to ExitStack alongside TikaClient; inline client creation removed from extract_metadata and _convert_to_pdf; __exit__ uses ExitStack.close() instead of __exit__ pass-through Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
171 lines
4.2 KiB
YAML
171 lines
4.2 KiB
YAML
# Dependabot configuration. The full list of supported options is documented at:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2

# Opt in to beta ecosystems; required for uv support for now
enable-beta-ecosystems: true

# One entry per package ecosystem that should receive update pull requests
updates:
# Frontend (pnpm) version updates; the entry is declared under the "npm"
# package ecosystem
- package-ecosystem: "npm"
  target-branch: "dev"
  # The `pnpm-lock.yaml` file lives in the `/src-ui` directory
  directory: "/src-ui"
  open-pull-requests-limit: 10
  schedule:
    interval: "monthly"
  cooldown:
    default-days: 7
  labels:
  - "frontend"
  - "dependencies"
  groups:
    # Angular framework and Angular-related libraries
    frontend-angular-dependencies:
      patterns:
      - "@angular*"
      - "@ng-*"
      - "ngx-*"
      - "ng2-pdf-viewer"
    # Jest test runner and its type definitions
    frontend-jest-dependencies:
      patterns:
      - "@types/jest"
      - "jest*"
    # ESLint and its TypeScript integration
    frontend-eslint-dependencies:
      patterns:
      - "@typescript-eslint*"
      - "eslint"
# Enable version updates for Python (backend) dependencies managed by uv
- package-ecosystem: "uv"
  target-branch: "dev"
  directory: "/"
  # Check for updates once a month
  schedule:
    interval: "monthly"
  cooldown:
    default-days: 7
  labels:
  - "backend"
  - "dependencies"
  # NOTE(review): Dependabot is documented to assign a dependency to the first
  # group it matches, so the two update-type-only "utilities" groups are
  # presumably meant to stay last as catch-alls - confirm before reordering.
  groups:
    # Development & CI/CD Tooling
    development:
      patterns:
      - "*pytest*"
      - "ruff"
      - "zensical"
      - "prek*"
    # Django & DRF Ecosystem
    django-ecosystem:
      patterns:
      - "*django*"
      - "drf-*"
      - "djangorestframework"
      - "whitenoise"
      - "bleach"
      - "jinja2"
    # Async, Task Queuing & Caching
    async-tasks:
      patterns:
      - "celery*"
      - "channels*"
      - "flower"
      - "redis"
    # Document, PDF, and OCR Processing
    document-processing:
      patterns:
      - "ocrmypdf"
      - "pdf2image"
      - "zxing-cpp"
      - "tika-client"
      - "gotenberg-client"
      - "python-magic"
      - "python-gnupg"
    # Data, NLP, and Search
    data-nlp-search:
      patterns:
      - "nltk"
      - "scikit-learn"
      - "langdetect"
      - "rapidfuzz"
      - "whoosh-reloaded"
    # Utilities (Patch Updates)
    utilities-patch:
      update-types:
      - "patch"
    # Utilities (Minor Updates)
    utilities-minor:
      update-types:
      - "minor"
# Keep the GitHub Actions used by the workflows up to date
- package-ecosystem: "github-actions"
  target-branch: "dev"
  directory: "/"
  schedule:
    # Actions are checked for updates on a monthly cadence
    interval: "monthly"
  cooldown:
    default-days: 7
  labels:
  - "ci-cd"
  - "dependencies"
  groups:
    # A single group that lists every update type (major, minor and patch)
    actions:
      update-types:
      - "major"
      - "minor"
      - "patch"
# Update Dockerfiles in the root and /.devcontainer/ directories
- package-ecosystem: "docker"
  directories:
  - "/"
  - "/.devcontainer/"
  schedule:
    interval: "monthly"
  cooldown:
    default-days: 7
  open-pull-requests-limit: 5
  labels:
  - "dependencies"
  # Commit messages are prefixed with "docker" and include the scope
  commit-message:
    prefix: "docker"
    include: "scope"
# Keep the images referenced by the Compose files in docker/compose current
- package-ecosystem: "docker-compose"
  directory: "/docker/compose/"
  schedule:
    interval: "monthly"
  cooldown:
    default-days: 7
  open-pull-requests-limit: 5
  labels:
  - "dependencies"
  # Commit messages are prefixed with "docker-compose" and include the scope
  commit-message:
    prefix: "docker-compose"
    include: "scope"
  groups:
    # One group per image
    gotenberg:
      patterns:
      - "docker.io/gotenberg/gotenberg*"
    tika:
      patterns:
      - "docker.io/apache/tika*"
    redis:
      patterns:
      - "docker.io/library/redis*"
    mariadb:
      patterns:
      - "docker.io/library/mariadb*"
    postgres:
      patterns:
      - "docker.io/library/postgres*"
    greenmail:
      patterns:
      - "docker.io/greenmail*"
# Enable updates for the "pre-commit" package ecosystem
- package-ecosystem: "pre-commit"
  # Package manifests are looked up from the repository root
  directory: "/"
  schedule:
    interval: "monthly"
  groups:
    # The "*" pattern puts every dependency of this ecosystem into one group
    pre-commit-dependencies:
      patterns:
      - "*"