Compare commits

..

1 Commits

Author SHA1 Message Date
Trenton H
f2a48bb929 Locks down permissions to the job level with least privilege we can get away with 2026-03-02 10:27:54 -08:00
43 changed files with 590 additions and 1510 deletions

View File

@@ -22,6 +22,7 @@ on:
concurrency:
group: backend-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
permissions: {}
env:
DEFAULT_UV_VERSION: "0.10.x"
NLTK_DATA: "/usr/share/nltk_data"
@@ -29,24 +30,26 @@ jobs:
test:
name: "Python ${{ matrix.python-version }}"
runs-on: ubuntu-24.04
permissions:
contents: read
strategy:
matrix:
python-version: ['3.10', '3.11', '3.12']
fail-fast: false
steps:
- name: Checkout
uses: actions/checkout@v6.0.2
uses: actions/checkout@v6
- name: Start containers
run: |
docker compose --file docker/compose/docker-compose.ci-test.yml pull --quiet
docker compose --file docker/compose/docker-compose.ci-test.yml up --detach
- name: Set up Python
id: setup-python
uses: actions/setup-python@v6.2.0
uses: actions/setup-python@v6
with:
python-version: "${{ matrix.python-version }}"
- name: Install uv
uses: astral-sh/setup-uv@v7.3.1
uses: astral-sh/setup-uv@v7
with:
version: ${{ env.DEFAULT_UV_VERSION }}
enable-cache: true
@@ -83,13 +86,13 @@ jobs:
pytest
- name: Upload test results to Codecov
if: always()
uses: codecov/codecov-action@v5.5.2
uses: codecov/codecov-action@v5
with:
flags: backend-python-${{ matrix.python-version }}
files: junit.xml
report_type: test_results
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5.5.2
uses: codecov/codecov-action@v5
with:
flags: backend-python-${{ matrix.python-version }}
files: coverage.xml
@@ -102,18 +105,20 @@ jobs:
typing:
name: Check project typing
runs-on: ubuntu-24.04
permissions:
contents: read
env:
DEFAULT_PYTHON: "3.12"
steps:
- name: Checkout
uses: actions/checkout@v6.0.2
uses: actions/checkout@v6.0.1
- name: Set up Python
id: setup-python
uses: actions/setup-python@v6.2.0
with:
python-version: "${{ env.DEFAULT_PYTHON }}"
- name: Install uv
uses: astral-sh/setup-uv@v7.3.1
uses: astral-sh/setup-uv@v7.2.1
with:
version: ${{ env.DEFAULT_UV_VERSION }}
enable-cache: true

View File

@@ -15,6 +15,7 @@ on:
concurrency:
group: docker-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
permissions: {}
env:
REGISTRY: ghcr.io
jobs:
@@ -41,7 +42,7 @@ jobs:
ref-name: ${{ steps.ref.outputs.name }}
steps:
- name: Checkout
uses: actions/checkout@v6.0.2
uses: actions/checkout@v6.0.1
- name: Determine ref name
id: ref
run: |
@@ -130,7 +131,7 @@ jobs:
type=semver,pattern={{major}}.{{minor}}
- name: Build and push by digest
id: build
uses: docker/build-push-action@v6.19.2
uses: docker/build-push-action@v6.18.0
with:
context: .
file: ./Dockerfile
@@ -152,7 +153,7 @@ jobs:
touch "/tmp/digests/${digest#sha256:}"
- name: Upload digest
if: steps.check-push.outputs.should-push == 'true'
uses: actions/upload-artifact@v7.0.0
uses: actions/upload-artifact@v6.0.0
with:
name: digests-${{ matrix.arch }}
path: /tmp/digests/*
@@ -168,7 +169,7 @@ jobs:
packages: write
steps:
- name: Download digests
uses: actions/download-artifact@v8.0.0
uses: actions/download-artifact@v7.0.0
with:
path: /tmp/digests
pattern: digests-*

View File

@@ -21,10 +21,7 @@ on:
concurrency:
group: docs-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
permissions:
contents: read
pages: write
id-token: write
permissions: {}
env:
DEFAULT_UV_VERSION: "0.10.x"
DEFAULT_PYTHON_VERSION: "3.12"
@@ -32,17 +29,19 @@ jobs:
build:
name: Build Documentation
runs-on: ubuntu-24.04
permissions:
contents: read
steps:
- uses: actions/configure-pages@v5.0.0
- uses: actions/configure-pages@v5
- name: Checkout
uses: actions/checkout@v6.0.2
uses: actions/checkout@v6
- name: Set up Python
id: setup-python
uses: actions/setup-python@v6.2.0
uses: actions/setup-python@v6
with:
python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
- name: Install uv
uses: astral-sh/setup-uv@v7.3.1
uses: astral-sh/setup-uv@v7
with:
version: ${{ env.DEFAULT_UV_VERSION }}
enable-cache: true
@@ -58,7 +57,7 @@ jobs:
--frozen \
zensical build --clean
- name: Upload GitHub Pages artifact
uses: actions/upload-pages-artifact@v4.0.0
uses: actions/upload-pages-artifact@v4
with:
path: site
name: github-pages-${{ github.run_id }}-${{ github.run_attempt }}
@@ -67,12 +66,16 @@ jobs:
needs: build
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
runs-on: ubuntu-24.04
permissions:
contents: read
pages: write
id-token: write
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
steps:
- name: Deploy GitHub Pages
uses: actions/deploy-pages@v4.0.5
uses: actions/deploy-pages@v4
id: deployment
with:
artifact_name: github-pages-${{ github.run_id }}-${{ github.run_attempt }}

View File

@@ -16,26 +16,29 @@ on:
concurrency:
group: frontend-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
permissions: {}
jobs:
install-dependencies:
name: Install Dependencies
runs-on: ubuntu-24.04
permissions:
contents: read
steps:
- name: Checkout
uses: actions/checkout@v6.0.2
uses: actions/checkout@v6
- name: Install pnpm
uses: pnpm/action-setup@v4.2.0
uses: pnpm/action-setup@v4
with:
version: 10
- name: Use Node.js 24
uses: actions/setup-node@v6.2.0
uses: actions/setup-node@v6
with:
node-version: 24.x
cache: 'pnpm'
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
- name: Cache frontend dependencies
id: cache-frontend-deps
uses: actions/cache@v5.0.3
uses: actions/cache@v5
with:
path: |
~/.pnpm-store
@@ -47,21 +50,23 @@ jobs:
name: Lint
needs: install-dependencies
runs-on: ubuntu-24.04
permissions:
contents: read
steps:
- name: Checkout
uses: actions/checkout@v6.0.2
uses: actions/checkout@v6
- name: Install pnpm
uses: pnpm/action-setup@v4.2.0
uses: pnpm/action-setup@v4
with:
version: 10
- name: Use Node.js 24
uses: actions/setup-node@v6.2.0
uses: actions/setup-node@v6
with:
node-version: 24.x
cache: 'pnpm'
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
- name: Cache frontend dependencies
uses: actions/cache@v5.0.3
uses: actions/cache@v5
with:
path: |
~/.pnpm-store
@@ -75,6 +80,8 @@ jobs:
name: "Unit Tests (${{ matrix.shard-index }}/${{ matrix.shard-count }})"
needs: install-dependencies
runs-on: ubuntu-24.04
permissions:
contents: read
strategy:
fail-fast: false
matrix:
@@ -83,19 +90,19 @@ jobs:
shard-count: [4]
steps:
- name: Checkout
uses: actions/checkout@v6.0.2
uses: actions/checkout@v6
- name: Install pnpm
uses: pnpm/action-setup@v4.2.0
uses: pnpm/action-setup@v4
with:
version: 10
- name: Use Node.js 24
uses: actions/setup-node@v6.2.0
uses: actions/setup-node@v6
with:
node-version: 24.x
cache: 'pnpm'
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
- name: Cache frontend dependencies
uses: actions/cache@v5.0.3
uses: actions/cache@v5
with:
path: |
~/.pnpm-store
@@ -107,13 +114,13 @@ jobs:
run: cd src-ui && pnpm run test --max-workers=2 --shard=${{ matrix.shard-index }}/${{ matrix.shard-count }}
- name: Upload test results to Codecov
if: always()
uses: codecov/codecov-action@v5.5.2
uses: codecov/codecov-action@v5
with:
flags: frontend-node-${{ matrix.node-version }}
directory: src-ui/
report_type: test_results
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5.5.2
uses: codecov/codecov-action@v5
with:
flags: frontend-node-${{ matrix.node-version }}
directory: src-ui/coverage/
@@ -121,6 +128,8 @@ jobs:
name: "E2E Tests (${{ matrix.shard-index }}/${{ matrix.shard-count }})"
needs: install-dependencies
runs-on: ubuntu-24.04
permissions:
contents: read
container: mcr.microsoft.com/playwright:v1.58.2-noble
env:
PLAYWRIGHT_BROWSERS_PATH: /ms-playwright
@@ -133,19 +142,19 @@ jobs:
shard-count: [2]
steps:
- name: Checkout
uses: actions/checkout@v6.0.2
uses: actions/checkout@v6
- name: Install pnpm
uses: pnpm/action-setup@v4.2.0
uses: pnpm/action-setup@v4
with:
version: 10
- name: Use Node.js 24
uses: actions/setup-node@v6.2.0
uses: actions/setup-node@v6
with:
node-version: 24.x
cache: 'pnpm'
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
- name: Cache frontend dependencies
uses: actions/cache@v5.0.3
uses: actions/cache@v5
with:
path: |
~/.pnpm-store
@@ -161,21 +170,23 @@ jobs:
name: Bundle Analysis
needs: [unit-tests, e2e-tests]
runs-on: ubuntu-24.04
permissions:
contents: read
steps:
- name: Checkout
uses: actions/checkout@v6.0.2
uses: actions/checkout@v6
- name: Install pnpm
uses: pnpm/action-setup@v4.2.0
uses: pnpm/action-setup@v4
with:
version: 10
- name: Use Node.js 24
uses: actions/setup-node@v6.2.0
uses: actions/setup-node@v6
with:
node-version: 24.x
cache: 'pnpm'
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
- name: Cache frontend dependencies
uses: actions/cache@v5.0.3
uses: actions/cache@v5
with:
path: |
~/.pnpm-store

View File

@@ -9,10 +9,13 @@ on:
concurrency:
group: lint-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
permissions: {}
jobs:
lint:
name: Linting via prek
runs-on: ubuntu-slim
permissions:
contents: read
steps:
- name: Checkout
uses: actions/checkout@v6.0.2

View File

@@ -7,6 +7,7 @@ on:
concurrency:
group: release-${{ github.ref }}
cancel-in-progress: false
permissions: {}
env:
DEFAULT_UV_VERSION: "0.10.x"
DEFAULT_PYTHON_VERSION: "3.12"
@@ -14,6 +15,10 @@ jobs:
wait-for-docker:
name: Wait for Docker Build
runs-on: ubuntu-24.04
permissions:
# lewagon/wait-on-check-action reads workflow check runs
actions: read
contents: read
steps:
- name: Wait for Docker build
uses: lewagon/wait-on-check-action@v1.5.0
@@ -26,16 +31,18 @@ jobs:
name: Build Release
needs: wait-for-docker
runs-on: ubuntu-24.04
permissions:
contents: read
steps:
- name: Checkout
uses: actions/checkout@v6.0.2
uses: actions/checkout@v6
# ---- Frontend Build ----
- name: Install pnpm
uses: pnpm/action-setup@v4.2.0
uses: pnpm/action-setup@v4
with:
version: 10
- name: Use Node.js 24
uses: actions/setup-node@v6.2.0
uses: actions/setup-node@v6
with:
node-version: 24.x
cache: 'pnpm'
@@ -47,11 +54,11 @@ jobs:
# ---- Backend Setup ----
- name: Set up Python
id: setup-python
uses: actions/setup-python@v6.2.0
uses: actions/setup-python@v6
with:
python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
- name: Install uv
uses: astral-sh/setup-uv@v7.3.1
uses: astral-sh/setup-uv@v7
with:
version: ${{ env.DEFAULT_UV_VERSION }}
enable-cache: true
@@ -118,7 +125,7 @@ jobs:
sudo chown -R 1000:1000 paperless-ngx/
tar -cJf paperless-ngx.tar.xz paperless-ngx/
- name: Upload release artifact
uses: actions/upload-artifact@v7.0.0
uses: actions/upload-artifact@v6
with:
name: release
path: dist/paperless-ngx.tar.xz
@@ -127,13 +134,17 @@ jobs:
name: Publish Release
needs: build-release
runs-on: ubuntu-24.04
permissions:
# release-drafter reads PRs to build the changelog and creates/publishes the release
contents: write
pull-requests: read
outputs:
prerelease: ${{ steps.get-version.outputs.prerelease }}
changelog: ${{ steps.create-release.outputs.body }}
version: ${{ steps.get-version.outputs.version }}
steps:
- name: Download release artifact
uses: actions/download-artifact@v8.0.0
uses: actions/download-artifact@v7
with:
name: release
path: ./
@@ -148,7 +159,7 @@ jobs:
fi
- name: Create release and changelog
id: create-release
uses: release-drafter/release-drafter@v6.2.0
uses: release-drafter/release-drafter@v6
with:
name: Paperless-ngx ${{ steps.get-version.outputs.version }}
tag: ${{ steps.get-version.outputs.version }}
@@ -159,7 +170,7 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Upload release archive
uses: shogo82148/actions-upload-release-asset@v1.9.2
uses: shogo82148/actions-upload-release-asset@v1
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
upload_url: ${{ steps.create-release.outputs.upload_url }}
@@ -174,18 +185,23 @@ jobs:
needs: publish-release
if: needs.publish-release.outputs.prerelease == 'false'
runs-on: ubuntu-24.04
permissions:
# git push of the changelog branch requires contents: write
# github.rest.pulls.create() and github.rest.issues.addLabels() require pull-requests: write
contents: write
pull-requests: write
steps:
- name: Checkout
uses: actions/checkout@v6.0.2
uses: actions/checkout@v6
with:
ref: main
- name: Set up Python
id: setup-python
uses: actions/setup-python@v6.2.0
uses: actions/setup-python@v6
with:
python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
- name: Install uv
uses: astral-sh/setup-uv@v7.3.1
uses: astral-sh/setup-uv@v7
with:
version: ${{ env.DEFAULT_UV_VERSION }}
enable-cache: true
@@ -218,7 +234,7 @@ jobs:
git commit -am "Changelog ${{ needs.publish-release.outputs.version }} - GHA"
git push origin ${{ needs.publish-release.outputs.version }}-changelog
- name: Create pull request
uses: actions/github-script@v8.0.0
uses: actions/github-script@v8
with:
script: |
const { repo, owner } = context.repo;

View File

@@ -12,6 +12,7 @@ on:
concurrency:
group: registry-tags-cleanup
cancel-in-progress: false
permissions: {}
jobs:
cleanup-images:
name: Cleanup Image Tags for ${{ matrix.primary-name }}

View File

@@ -18,6 +18,7 @@ on:
branches: [dev]
schedule:
- cron: '28 13 * * 5'
permissions: {}
jobs:
analyze:
name: Analyze
@@ -34,10 +35,10 @@ jobs:
# Learn more about CodeQL language support at https://git.io/codeql-language-support
steps:
- name: Checkout repository
uses: actions/checkout@v6.0.2
uses: actions/checkout@v6
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v4.32.5
uses: github/codeql-action/init@v4
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
@@ -45,4 +46,4 @@ jobs:
# Prefix the list here with "+" to use these queries and those in the config file.
# queries: ./path/to/local/query, your-org/your-repo/queries@main
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v4.32.5
uses: github/codeql-action/analyze@v4

View File

@@ -6,18 +6,23 @@ on:
push:
paths: ['src/locale/**', 'src-ui/messages.xlf', 'src-ui/src/locale/**']
branches: [dev]
permissions: {}
jobs:
synchronize-with-crowdin:
name: Crowdin Sync
if: github.repository_owner == 'paperless-ngx'
runs-on: ubuntu-24.04
permissions:
# Crowdin action pushes translation branches and creates/updates PRs via GITHUB_TOKEN
contents: write
pull-requests: write
steps:
- name: Checkout
uses: actions/checkout@v6.0.2
uses: actions/checkout@v6
with:
token: ${{ secrets.PNGX_BOT_PAT }}
- name: crowdin action
uses: crowdin/github-action@v2.15.0
uses: crowdin/github-action@v2
with:
upload_translations: false
download_translations: true

View File

@@ -2,17 +2,19 @@ name: PR Bot
on:
pull_request_target:
types: [opened]
permissions:
contents: read
pull-requests: write
permissions: {}
jobs:
pr-bot:
name: Automated PR Bot
runs-on: ubuntu-latest
permissions:
# labeler reads file paths; all steps add labels or post comments on PRs
contents: read
pull-requests: write
steps:
- name: Label PR by file path or branch name
# see .github/labeler.yml for the labeler config
uses: actions/labeler@v6.0.1
uses: actions/labeler@v6
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
- name: Label by size
@@ -26,7 +28,7 @@ jobs:
fail_if_xl: 'false'
excluded_files: /\.lock$/ /\.txt$/ ^src-ui/pnpm-lock\.yaml$ ^src-ui/messages\.xlf$ ^src/locale/en_US/LC_MESSAGES/django\.po$
- name: Label by PR title
uses: actions/github-script@v8.0.0
uses: actions/github-script@v8
with:
script: |
const pr = context.payload.pull_request;
@@ -52,7 +54,7 @@ jobs:
}
- name: Label bot-generated PRs
if: ${{ contains(github.actor, 'dependabot') || contains(github.actor, 'crowdin-bot') }}
uses: actions/github-script@v8.0.0
uses: actions/github-script@v8
with:
script: |
const pr = context.payload.pull_request;
@@ -77,7 +79,7 @@ jobs:
}
- name: Welcome comment
if: ${{ !contains(github.actor, 'bot') }}
uses: actions/github-script@v8.0.0
uses: actions/github-script@v8
with:
script: |
const pr = context.payload.pull_request;

View File

@@ -7,18 +7,19 @@ on:
branches:
- main
- dev
permissions:
contents: read
permissions: {}
jobs:
pr_opened_or_reopened:
name: pr_opened_or_reopened
runs-on: ubuntu-24.04
permissions:
# release-drafter reads its config file from the repo
contents: read
# write permission is required for autolabeler
pull-requests: write
if: github.event_name == 'pull_request_target' && (github.event.action == 'opened' || github.event.action == 'reopened') && github.event.pull_request.user.login != 'dependabot'
steps:
- name: Label PR with release-drafter
uses: release-drafter/release-drafter@v6.2.0
uses: release-drafter/release-drafter@v6
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -3,10 +3,7 @@ on:
schedule:
- cron: '0 3 * * *'
workflow_dispatch:
permissions:
issues: write
pull-requests: write
discussions: write
permissions: {}
concurrency:
group: lock
jobs:
@@ -14,8 +11,11 @@ jobs:
name: 'Stale'
if: github.repository_owner == 'paperless-ngx'
runs-on: ubuntu-24.04
permissions:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v10.2.0
- uses: actions/stale@v10
with:
days-before-stale: 7
days-before-close: 14
@@ -36,8 +36,12 @@ jobs:
name: 'Lock Old Threads'
if: github.repository_owner == 'paperless-ngx'
runs-on: ubuntu-24.04
permissions:
issues: write
pull-requests: write
discussions: write
steps:
- uses: dessant/lock-threads@v6.0.0
- uses: dessant/lock-threads@v6
with:
issue-inactive-days: '30'
pr-inactive-days: '30'
@@ -56,8 +60,10 @@ jobs:
name: 'Close Answered Discussions'
if: github.repository_owner == 'paperless-ngx'
runs-on: ubuntu-24.04
permissions:
discussions: write
steps:
- uses: actions/github-script@v8.0.0
- uses: actions/github-script@v8
with:
script: |
function sleep(ms) {
@@ -113,8 +119,10 @@ jobs:
name: 'Close Outdated Discussions'
if: github.repository_owner == 'paperless-ngx'
runs-on: ubuntu-24.04
permissions:
discussions: write
steps:
- uses: actions/github-script@v8.0.0
- uses: actions/github-script@v8
with:
script: |
function sleep(ms) {
@@ -205,8 +213,10 @@ jobs:
name: 'Close Unsupported Feature Requests'
if: github.repository_owner == 'paperless-ngx'
runs-on: ubuntu-24.04
permissions:
discussions: write
steps:
- uses: actions/github-script@v8.0.0
- uses: actions/github-script@v8
with:
script: |
function sleep(ms) {

View File

@@ -3,6 +3,7 @@ on:
push:
branches:
- dev
permissions: {}
jobs:
generate-translate-strings:
name: Generate Translation Strings
@@ -11,7 +12,7 @@ jobs:
contents: write
steps:
- name: Checkout code
uses: actions/checkout@v6.0.2
uses: actions/checkout@v6
env:
GH_REF: ${{ github.ref }} # sonar rule:githubactions:S7630 - avoid injection
with:
@@ -19,13 +20,13 @@ jobs:
ref: ${{ env.GH_REF }}
- name: Set up Python
id: setup-python
uses: actions/setup-python@v6.2.0
uses: actions/setup-python@v6
- name: Install system dependencies
run: |
sudo apt-get update -qq
sudo apt-get install -qq --no-install-recommends gettext
- name: Install uv
uses: astral-sh/setup-uv@v7.3.1
uses: astral-sh/setup-uv@v7
with:
enable-cache: true
- name: Install backend python dependencies
@@ -36,18 +37,18 @@ jobs:
- name: Generate backend translation strings
run: cd src/ && uv run manage.py makemessages -l en_US -i "samples*"
- name: Install pnpm
uses: pnpm/action-setup@v4.2.0
uses: pnpm/action-setup@v4
with:
version: 10
- name: Use Node.js 24
uses: actions/setup-node@v6.2.0
uses: actions/setup-node@v6
with:
node-version: 24.x
cache: 'pnpm'
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
- name: Cache frontend dependencies
id: cache-frontend-deps
uses: actions/cache@v5.0.3
uses: actions/cache@v5
with:
path: |
~/.pnpm-store
@@ -63,7 +64,7 @@ jobs:
cd src-ui
pnpm run ng extract-i18n
- name: Commit changes
uses: stefanzweifel/git-auto-commit-action@v7.1.0
uses: stefanzweifel/git-auto-commit-action@v7
with:
file_pattern: 'src-ui/messages.xlf src/locale/en_US/LC_MESSAGES/django.po'
commit_message: "Auto translate strings"

View File

@@ -45,7 +45,7 @@ ENV \
ARG TARGETARCH
ARG TARGETVARIANT
# Lock this version
ARG S6_OVERLAY_VERSION=3.2.2.0
ARG S6_OVERLAY_VERSION=3.2.1.0
ARG S6_BUILD_TIME_PKGS="curl \
xz-utils"

View File

@@ -4,7 +4,7 @@
# correct networking for the tests
services:
gotenberg:
image: docker.io/gotenberg/gotenberg:8.27
image: docker.io/gotenberg/gotenberg:8.26
hostname: gotenberg
container_name: gotenberg
network_mode: host

View File

@@ -72,7 +72,7 @@ services:
PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000
PAPERLESS_TIKA_ENDPOINT: http://tika:9998
gotenberg:
image: docker.io/gotenberg/gotenberg:8.27
image: docker.io/gotenberg/gotenberg:8.26
restart: unless-stopped
# The gotenberg chromium route is used to convert .eml files. We do not
# want to allow external content like tracking pixels or even javascript.

View File

@@ -66,7 +66,7 @@ services:
PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000
PAPERLESS_TIKA_ENDPOINT: http://tika:9998
gotenberg:
image: docker.io/gotenberg/gotenberg:8.27
image: docker.io/gotenberg/gotenberg:8.26
restart: unless-stopped
# The gotenberg chromium route is used to convert .eml files. We do not
# want to allow external content like tracking pixels or even javascript.

View File

@@ -55,7 +55,7 @@ services:
PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000
PAPERLESS_TIKA_ENDPOINT: http://tika:9998
gotenberg:
image: docker.io/gotenberg/gotenberg:8.27
image: docker.io/gotenberg/gotenberg:8.26
restart: unless-stopped
# The gotenberg chromium route is used to convert .eml files. We do not
# want to allow external content like tracking pixels or even javascript.

View File

@@ -262,10 +262,6 @@ your files differently, you can do that by adjusting the
or using [storage paths (see below)](#storage-paths). Paperless adds the
correct file extension e.g. `.pdf`, `.jpg` automatically.
When a document has file versions, each version uses the same naming rules and
storage path resolution as any other document file, with an added version suffix
such as `_v1`, `_v2`, etc.
This variable allows you to configure the filename (folders are allowed)
using placeholders. For example, configuring this to
@@ -357,8 +353,6 @@ If paperless detects that two documents share the same filename,
paperless will automatically append `_01`, `_02`, etc. to the filename.
This happens if all the placeholders in a filename evaluate to the same
value.
For versioned files, this counter is appended after the version suffix
(for example `statement_v2_01.pdf`).
If there are any errors in the placeholders included in `PAPERLESS_FILENAME_FORMAT`,
paperless will fall back to using the default naming scheme instead.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 57 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 61 KiB

View File

@@ -95,7 +95,6 @@ Think of versions as **file history** for a document.
- Versions track the underlying file and extracted text content (OCR/text).
- Metadata such as tags, correspondent, document type, storage path and custom fields stay on the "root" document.
- Version files follow normal filename formatting (including storage paths) and add a `_vN` suffix (for example `_v1`, `_v2`).
- By default, search and document content use the latest version.
- In document detail, selecting a version switches the preview, file metadata and content (as well as the download and related buttons) to that version.
- Deleting a non-root version keeps metadata and falls back to the latest remaining version.
@@ -617,7 +616,7 @@ applied. You can use the following placeholders in the template with any trigger
- `{{added_day}}`: added day
- `{{added_time}}`: added time in HH:MM format
- `{{original_filename}}`: original file name without extension
- `{{filename}}`: current file name without extension (for "added" workflows this may not be final yet, you can use `{{original_filename}}`)
- `{{filename}}`: current file name without extension
- `{{doc_title}}`: current document title (cannot be used in title assignment)
The following placeholders are only available for "added" or "updated" triggers

View File

@@ -11,7 +11,6 @@ import magic
from django.conf import settings
from django.contrib.auth.models import User
from django.db import transaction
from django.db.models import Max
from django.db.models import Q
from django.utils import timezone
from filelock import FileLock
@@ -124,6 +123,22 @@ class ConsumerPluginMixin:
self.filename = self.metadata.filename or self.input_doc.original_file.name
if input_doc.root_document_id:
self.log.debug(
f"Document root document id: {input_doc.root_document_id}",
)
root_document = Document.objects.get(pk=input_doc.root_document_id)
version_index = Document.objects.filter(root_document=root_document).count()
filename_path = Path(self.filename)
if filename_path.suffix:
self.filename = str(
filename_path.with_name(
f"{filename_path.stem}_v{version_index}{filename_path.suffix}",
),
)
else:
self.filename = f"{self.filename}_v{version_index}"
def _send_progress(
self,
current_progress: int,
@@ -169,7 +184,7 @@ class ConsumerPlugin(
):
logging_name = LOGGING_NAME
def _create_version_from_root(
def _clone_root_into_version(
self,
root_doc: Document,
*,
@@ -178,38 +193,30 @@ class ConsumerPlugin(
mime_type: str,
) -> Document:
self.log.debug("Saving record for updated version to database")
root_doc_frozen = Document.objects.select_for_update().get(pk=root_doc.pk)
next_version_index = (
Document.global_objects.filter(
root_document_id=root_doc_frozen.pk,
).aggregate(
max_index=Max("version_index"),
)["max_index"]
or 0
)
version_doc = Document.objects.get(pk=root_doc.pk)
setattr(version_doc, "pk", None)
version_doc.root_document = root_doc
file_for_checksum = (
self.unmodified_original
if self.unmodified_original is not None
else self.working_copy
)
version_doc = Document(
root_document=root_doc_frozen,
version_index=next_version_index + 1,
checksum=hashlib.md5(
file_for_checksum.read_bytes(),
).hexdigest(),
content=text or "",
page_count=page_count,
mime_type=mime_type,
original_filename=self.filename,
owner_id=root_doc_frozen.owner_id,
created=root_doc_frozen.created,
title=root_doc_frozen.title,
added=timezone.now(),
modified=timezone.now(),
)
version_doc.checksum = hashlib.md5(
file_for_checksum.read_bytes(),
).hexdigest()
version_doc.content = text or ""
version_doc.page_count = page_count
version_doc.mime_type = mime_type
version_doc.original_filename = self.filename
version_doc.storage_path = root_doc.storage_path
# Clear unique file path fields so they can be generated uniquely later
version_doc.filename = None
version_doc.archive_filename = None
version_doc.archive_checksum = None
if self.metadata.version_label is not None:
version_doc.version_label = self.metadata.version_label
version_doc.added = timezone.now()
version_doc.modified = timezone.now()
return version_doc
def run_pre_consume_script(self) -> None:
@@ -535,7 +542,7 @@ class ConsumerPlugin(
root_doc = Document.objects.get(
pk=self.input_doc.root_document_id,
)
original_document = self._create_version_from_root(
original_document = self._clone_root_into_version(
root_doc,
text=text,
page_count=page_count,

View File

@@ -128,18 +128,11 @@ def generate_filename(
counter=0,
archive_filename=False,
) -> Path:
# version docs use the root document for formatting, just with a suffix
context_doc = doc if doc.root_document_id is None else doc.root_document
version_suffix = (
f"_v{doc.version_index}"
if doc.root_document_id is not None and doc.version_index is not None
else ""
)
base_path: Path | None = None
# Determine the source of the format string
if context_doc.storage_path is not None:
filename_format = context_doc.storage_path.path
if doc.storage_path is not None:
filename_format = doc.storage_path.path
elif settings.FILENAME_FORMAT is not None:
# Maybe convert old to new style
filename_format = convert_format_str_to_template_format(
@@ -150,7 +143,7 @@ def generate_filename(
# If we have one, render it
if filename_format is not None:
rendered_path: str | None = format_filename(context_doc, filename_format)
rendered_path: str | None = format_filename(doc, filename_format)
if rendered_path:
base_path = Path(rendered_path)
@@ -164,7 +157,7 @@ def generate_filename(
base_filename = base_path.name
# Build the final filename with counter and filetype
final_filename = f"{base_filename}{version_suffix}{counter_str}{filetype_str}"
final_filename = f"{base_filename}{counter_str}{filetype_str}"
# If we have a directory component, include it
if str(directory) != ".":
@@ -173,9 +166,7 @@ def generate_filename(
full_path = Path(final_filename)
else:
# No template, use document ID
final_filename = (
f"{context_doc.pk:07}{version_suffix}{counter_str}{filetype_str}"
)
final_filename = f"{doc.pk:07}{counter_str}{filetype_str}"
full_path = Path(final_filename)
return full_path

View File

@@ -1,25 +1,22 @@
from django.core.management import BaseCommand
from django.db import transaction
from documents.management.commands.base import PaperlessCommand
from documents.management.commands.mixins import ProgressBarMixin
from documents.tasks import index_optimize
from documents.tasks import index_reindex
class Command(PaperlessCommand):
class Command(ProgressBarMixin, BaseCommand):
help = "Manages the document index."
def add_arguments(self, parser):
super().add_arguments(parser)
parser.add_argument("command", choices=["reindex", "optimize"])
self.add_argument_progress_bar_mixin(parser)
def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
with transaction.atomic():
if options["command"] == "reindex":
index_reindex(
iter_wrapper=lambda docs: self.track(
docs,
description="Indexing documents...",
),
)
index_reindex(progress_bar_disable=self.no_progress_bar)
elif options["command"] == "optimize":
index_optimize()

View File

@@ -1,22 +1,22 @@
from typing import Any
from django.core.management import BaseCommand
from django.db import transaction
from documents.management.commands.base import PaperlessCommand
from documents.management.commands.mixins import ProgressBarMixin
from documents.tasks import llmindex_index
class Command(PaperlessCommand):
class Command(ProgressBarMixin, BaseCommand):
help = "Manages the LLM-based vector index for Paperless."
def add_arguments(self, parser: Any) -> None:
super().add_arguments(parser)
def add_arguments(self, parser):
parser.add_argument("command", choices=["rebuild", "update"])
self.add_argument_progress_bar_mixin(parser)
def handle(self, *args: Any, **options: Any) -> None:
llmindex_index(
rebuild=options["command"] == "rebuild",
scheduled=False,
iter_wrapper=lambda docs: self.track(
docs,
description="Indexing documents...",
),
)
def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
with transaction.atomic():
llmindex_index(
progress_bar_disable=self.no_progress_bar,
rebuild=options["command"] == "rebuild",
scheduled=False,
)

View File

@@ -1,117 +1,17 @@
"""Management command to check the document archive for issues."""
from django.core.management.base import BaseCommand
from __future__ import annotations
import logging
from typing import Any
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from documents.management.commands.base import PaperlessCommand
from documents.models import Document
from documents.sanity_checker import SanityCheckMessages
from documents.management.commands.mixins import ProgressBarMixin
from documents.sanity_checker import check_sanity
_LEVEL_STYLE: dict[int, tuple[str, str]] = {
logging.ERROR: ("bold red", "ERROR"),
logging.WARNING: ("yellow", "WARN"),
logging.INFO: ("dim", "INFO"),
}
class Command(PaperlessCommand):
class Command(ProgressBarMixin, BaseCommand):
help = "This command checks your document archive for issues."
def _render_results(self, messages: SanityCheckMessages) -> None:
"""Render sanity check results as a Rich table."""
def add_arguments(self, parser):
self.add_argument_progress_bar_mixin(parser)
if (
not messages.has_error
and not messages.has_warning
and not messages.has_info
):
self.console.print(
Panel(
"[green]No issues detected.[/green]",
title="Sanity Check",
border_style="green",
),
)
return
def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
messages = check_sanity(progress=self.use_progress_bar, scheduled=False)
# Build a lookup for document titles
doc_pks = [pk for pk in messages.document_pks() if pk is not None]
titles: dict[int, str] = {}
if doc_pks:
titles = dict(
Document.global_objects.filter(pk__in=doc_pks)
.only("pk", "title")
.values_list("pk", "title"),
)
table = Table(
title="Sanity Check Results",
show_lines=True,
title_style="bold",
)
table.add_column("Level", width=7, no_wrap=True)
table.add_column("Document", min_width=20)
table.add_column("Issue", ratio=1)
for doc_pk, doc_messages in messages.iter_messages():
if doc_pk is not None:
title = titles.get(doc_pk, "Unknown")
doc_label = f"#{doc_pk} {title}"
else:
doc_label = "(global)"
for msg in doc_messages:
style, label = _LEVEL_STYLE.get(
msg["level"],
("dim", "INFO"),
)
table.add_row(
Text(label, style=style),
Text(doc_label),
Text(str(msg["message"])),
)
self.console.print(table)
parts: list[str] = []
if messages.document_error_count:
parts.append(
f"{messages.document_error_count} document(s) with [bold red]errors[/bold red]",
)
if messages.document_warning_count:
parts.append(
f"{messages.document_warning_count} document(s) with [yellow]warnings[/yellow]",
)
if messages.document_info_count:
parts.append(f"{messages.document_info_count} document(s) with infos")
if messages.global_warning_count:
parts.append(
f"{messages.global_warning_count} global [yellow]warning(s)[/yellow]",
)
if parts:
if len(parts) > 1:
summary = ", ".join(parts[:-1]) + " and " + parts[-1]
else:
summary = parts[0]
self.console.print(f"\nFound {summary}.")
else:
self.console.print("\nNo issues found.")
def handle(self, *args: Any, **options: Any) -> None:
messages = check_sanity(
scheduled=False,
iter_wrapper=lambda docs: self.track(
docs,
description="Checking documents...",
),
)
self._render_results(messages)
messages.log_messages()

View File

@@ -1,37 +0,0 @@
# Generated by Django 5.2.11 on 2026-03-02 17:48
from django.conf import settings
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
    """Add a per-root version index to ``Document``.

    Introduces the nullable, indexed ``version_index`` column and a
    conditional unique constraint on ``(root_document, version_index)``.
    """

    dependencies = [
        ("documents", "0013_document_root_document"),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.AddField(
            model_name="document",
            name="version_index",
            field=models.PositiveIntegerField(
                blank=True,
                db_index=True,
                help_text="Index of this version within the root document.",
                null=True,
                verbose_name="version index",
            ),
        ),
        # The uniqueness rule is conditional: it only applies to rows where
        # both root_document and version_index are set.
        migrations.AddConstraint(
            model_name="document",
            constraint=models.UniqueConstraint(
                condition=models.Q(
                    ("root_document__isnull", False),
                    ("version_index__isnull", False),
                ),
                fields=("root_document", "version_index"),
                name="documents_document_root_version_index_uniq",
            ),
        ),
    ]

View File

@@ -317,14 +317,6 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
verbose_name=_("root document for this version"),
)
version_index = models.PositiveIntegerField(
_("version index"),
blank=True,
null=True,
db_index=True,
help_text=_("Index of this version within the root document."),
)
version_label = models.CharField(
_("version label"),
max_length=64,
@@ -337,16 +329,6 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
ordering = ("-created",)
verbose_name = _("document")
verbose_name_plural = _("documents")
constraints = [
models.UniqueConstraint(
fields=["root_document", "version_index"],
condition=models.Q(
root_document__isnull=False,
version_index__isnull=False,
),
name="documents_document_root_version_index_uniq",
),
]
def __str__(self) -> str:
created = self.created.isoformat()

View File

@@ -1,174 +1,80 @@
"""
Sanity checker for the Paperless-ngx document archive.
Verifies that all documents have valid files, correct checksums,
and consistent metadata. Reports orphaned files in the media directory.
Progress display is the caller's responsibility -- pass an ``iter_wrapper``
to wrap the document queryset (e.g., with a progress bar). The default
is an identity function that adds no overhead.
"""
from __future__ import annotations
import hashlib
import logging
import uuid
from collections import defaultdict
from collections.abc import Callable
from collections.abc import Iterable
from collections.abc import Iterator
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Final
from typing import TypedDict
from typing import TypeVar
from celery import states
from django.conf import settings
from django.utils import timezone
from tqdm import tqdm
from documents.models import Document
from documents.models import PaperlessTask
from paperless.config import GeneralConfig
logger = logging.getLogger("paperless.sanity_checker")
_T = TypeVar("_T")
IterWrapper = Callable[[Iterable[_T]], Iterable[_T]]
class MessageEntry(TypedDict):
    """A single sanity check message with its severity level."""

    # Severity expressed as a ``logging`` level constant (e.g. logging.ERROR).
    level: int
    # Human-readable description of the detected issue.
    message: str
def _identity(iterable: Iterable[_T]) -> Iterable[_T]:
"""Pass through an iterable unchanged (default iter_wrapper)."""
return iterable
class SanityCheckMessages:
"""Collects sanity check messages grouped by document primary key.
Messages are categorized as error, warning, or info. ``None`` is used
as the key for messages not associated with a specific document
(e.g., orphaned files).
"""
def __init__(self) -> None:
self._messages: dict[int | None, list[MessageEntry]] = defaultdict(list)
self.has_error: bool = False
self.has_warning: bool = False
self.has_info: bool = False
self.document_count: int = 0
self.document_error_count: int = 0
self.document_warning_count: int = 0
self.document_info_count: int = 0
self.global_warning_count: int = 0
self._messages: dict[int, list[dict]] = defaultdict(list)
self.has_error = False
self.has_warning = False
# -- Recording ----------------------------------------------------------
def error(self, doc_pk: int | None, message: str) -> None:
def error(self, doc_pk, message) -> None:
self._messages[doc_pk].append({"level": logging.ERROR, "message": message})
self.has_error = True
if doc_pk is not None:
self.document_count += 1
self.document_error_count += 1
def warning(self, doc_pk: int | None, message: str) -> None:
def warning(self, doc_pk, message) -> None:
self._messages[doc_pk].append({"level": logging.WARNING, "message": message})
self.has_warning = True
if doc_pk is not None:
self.document_count += 1
self.document_warning_count += 1
else:
# This is the only type of global message we do right now
self.global_warning_count += 1
def info(self, doc_pk: int | None, message: str) -> None:
def info(self, doc_pk, message) -> None:
self._messages[doc_pk].append({"level": logging.INFO, "message": message})
self.has_info = True
if doc_pk is not None:
self.document_count += 1
self.document_info_count += 1
# -- Iteration / query --------------------------------------------------
def document_pks(self) -> list[int | None]:
"""Return all document PKs (including None for global messages)."""
return list(self._messages.keys())
def iter_messages(self) -> Iterator[tuple[int | None, list[MessageEntry]]]:
"""Iterate over (doc_pk, messages) pairs."""
yield from self._messages.items()
def __getitem__(self, item: int | None) -> list[MessageEntry]:
return self._messages[item]
# -- Summarize Helpers --------------------------------------------------
@property
def has_global_issues(self) -> bool:
return None in self._messages
@property
def total_issue_count(self) -> int:
"""Total number of error and warning messages across all documents and global."""
return (
self.document_error_count
+ self.document_warning_count
+ self.global_warning_count
)
# -- Logging output (used by Celery task path) --------------------------
def log_messages(self) -> None:
"""Write all messages to the ``paperless.sanity_checker`` logger.
logger = logging.getLogger("paperless.sanity_checker")
This is the output path for headless / Celery execution.
Management commands use Rich rendering instead.
"""
if len(self._messages) == 0:
logger.info("Sanity checker detected no issues.")
return
else:
# Query once
all_docs = Document.global_objects.all()
doc_pks = [pk for pk in self._messages if pk is not None]
titles: dict[int, str] = {}
if doc_pks:
titles = dict(
Document.global_objects.filter(pk__in=doc_pks)
.only("pk", "title")
.values_list("pk", "title"),
)
for doc_pk in self._messages:
if doc_pk is not None:
doc = all_docs.get(pk=doc_pk)
logger.info(
f"Detected following issue(s) with document #{doc.pk},"
f" titled {doc.title}",
)
for msg in self._messages[doc_pk]:
logger.log(msg["level"], msg["message"])
for doc_pk, entries in self._messages.items():
if doc_pk is not None:
title = titles.get(doc_pk, "Unknown")
logger.info(
"Detected following issue(s) with document #%s, titled %s",
doc_pk,
title,
)
for msg in entries:
logger.log(msg["level"], msg["message"])
def __len__(self):
return len(self._messages)
def __getitem__(self, item):
return self._messages[item]
class SanityCheckFailedException(Exception):
pass
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def check_sanity(*, progress=False, scheduled=True) -> SanityCheckMessages:
paperless_task = PaperlessTask.objects.create(
task_id=uuid.uuid4(),
type=PaperlessTask.TaskType.SCHEDULED_TASK
if scheduled
else PaperlessTask.TaskType.MANUAL_TASK,
task_name=PaperlessTask.TaskName.CHECK_SANITY,
status=states.STARTED,
date_created=timezone.now(),
date_started=timezone.now(),
)
messages = SanityCheckMessages()
def _build_present_files() -> set[Path]:
"""Collect all files in MEDIA_ROOT, excluding directories and ignorable files."""
present_files = {
x.resolve()
for x in Path(settings.MEDIA_ROOT).glob("**/*")
@@ -176,178 +82,95 @@ def _build_present_files() -> set[Path]:
}
lockfile = Path(settings.MEDIA_LOCK).resolve()
present_files.discard(lockfile)
if lockfile in present_files:
present_files.remove(lockfile)
general_config = GeneralConfig()
app_logo = general_config.app_logo or settings.APP_LOGO
if app_logo:
logo_file = Path(settings.MEDIA_ROOT / Path(app_logo.lstrip("/"))).resolve()
present_files.discard(logo_file)
if logo_file in present_files:
present_files.remove(logo_file)
return present_files
def _check_thumbnail(
doc: Document,
messages: SanityCheckMessages,
present_files: set[Path],
) -> None:
"""Verify the thumbnail exists and is readable."""
thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
if not thumbnail_path.exists() or not thumbnail_path.is_file():
messages.error(doc.pk, "Thumbnail of document does not exist.")
return
present_files.discard(thumbnail_path)
try:
_ = thumbnail_path.read_bytes()
except OSError as e:
messages.error(doc.pk, f"Cannot read thumbnail file of document: {e}")
def _check_original(
doc: Document,
messages: SanityCheckMessages,
present_files: set[Path],
) -> None:
"""Verify the original file exists, is readable, and has matching checksum."""
source_path: Final[Path] = Path(doc.source_path).resolve()
if not source_path.exists() or not source_path.is_file():
messages.error(doc.pk, "Original of document does not exist.")
return
present_files.discard(source_path)
try:
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
except OSError as e:
messages.error(doc.pk, f"Cannot read original file of document: {e}")
else:
if checksum != doc.checksum:
messages.error(
doc.pk,
f"Checksum mismatch. Stored: {doc.checksum}, actual: {checksum}.",
)
def _check_archive(
doc: Document,
messages: SanityCheckMessages,
present_files: set[Path],
) -> None:
"""Verify archive file consistency: checksum/filename pairing and file integrity."""
if doc.archive_checksum is not None and doc.archive_filename is None:
messages.error(
doc.pk,
"Document has an archive file checksum, but no archive filename.",
)
elif doc.archive_checksum is None and doc.archive_filename is not None:
messages.error(
doc.pk,
"Document has an archive file, but its checksum is missing.",
)
elif doc.has_archive_version:
if TYPE_CHECKING:
assert isinstance(doc.archive_path, Path)
archive_path: Final[Path] = Path(doc.archive_path).resolve()
if not archive_path.exists() or not archive_path.is_file():
messages.error(doc.pk, "Archived version of document does not exist.")
return
present_files.discard(archive_path)
try:
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
except OSError as e:
messages.error(
doc.pk,
f"Cannot read archive file of document: {e}",
)
for doc in tqdm(Document.global_objects.all(), disable=not progress):
# Check sanity of the thumbnail
thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
if not thumbnail_path.exists() or not thumbnail_path.is_file():
messages.error(doc.pk, "Thumbnail of document does not exist.")
else:
if checksum != doc.archive_checksum:
messages.error(
doc.pk,
"Checksum mismatch of archived document. "
f"Stored: {doc.archive_checksum}, actual: {checksum}.",
)
if thumbnail_path in present_files:
present_files.remove(thumbnail_path)
try:
_ = thumbnail_path.read_bytes()
except OSError as e:
messages.error(doc.pk, f"Cannot read thumbnail file of document: {e}")
# Check sanity of the original file
# TODO: extract method
source_path: Final[Path] = Path(doc.source_path).resolve()
if not source_path.exists() or not source_path.is_file():
messages.error(doc.pk, "Original of document does not exist.")
else:
if source_path in present_files:
present_files.remove(source_path)
try:
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
except OSError as e:
messages.error(doc.pk, f"Cannot read original file of document: {e}")
else:
if checksum != doc.checksum:
messages.error(
doc.pk,
"Checksum mismatch. "
f"Stored: {doc.checksum}, actual: {checksum}.",
)
def _check_content(doc: Document, messages: SanityCheckMessages) -> None:
"""Flag documents with no OCR content."""
if not doc.content:
messages.info(doc.pk, "Document contains no OCR data")
# Check sanity of the archive file.
if doc.archive_checksum is not None and doc.archive_filename is None:
messages.error(
doc.pk,
"Document has an archive file checksum, but no archive filename.",
)
elif doc.archive_checksum is None and doc.archive_filename is not None:
messages.error(
doc.pk,
"Document has an archive file, but its checksum is missing.",
)
elif doc.has_archive_version:
archive_path: Final[Path] = Path(doc.archive_path).resolve()
if not archive_path.exists() or not archive_path.is_file():
messages.error(doc.pk, "Archived version of document does not exist.")
else:
if archive_path in present_files:
present_files.remove(archive_path)
try:
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
except OSError as e:
messages.error(
doc.pk,
f"Cannot read archive file of document : {e}",
)
else:
if checksum != doc.archive_checksum:
messages.error(
doc.pk,
"Checksum mismatch of archived document. "
f"Stored: {doc.archive_checksum}, "
f"actual: {checksum}.",
)
def _check_document(
    doc: Document,
    messages: SanityCheckMessages,
    present_files: set[Path],
) -> None:
    """Run all checks for a single document.

    The thumbnail, original, archive and content checks are run in that
    order; each one records its findings in ``messages``.

    Args:
        doc: Document to check.
        messages: Collector shared by all individual checks.
        present_files: Set of files found on disk, passed through to the
            file-based checks.
    """
    _check_thumbnail(doc, messages, present_files)
    _check_original(doc, messages, present_files)
    _check_archive(doc, messages, present_files)
    _check_content(doc, messages)
# ---------------------------------------------------------------------------
# Public entry point
# ---------------------------------------------------------------------------
def check_sanity(
*,
scheduled: bool = True,
iter_wrapper: IterWrapper[Document] = _identity,
) -> SanityCheckMessages:
"""Run a full sanity check on the document archive.
Args:
scheduled: Whether this is a scheduled (automatic) or manual check.
Controls the task type recorded in the database.
iter_wrapper: A callable that wraps the document iterable, e.g.,
for progress bar display. Defaults to identity (no wrapping).
Returns:
A SanityCheckMessages instance containing all detected issues.
"""
paperless_task = PaperlessTask.objects.create(
task_id=uuid.uuid4(),
type=(
PaperlessTask.TaskType.SCHEDULED_TASK
if scheduled
else PaperlessTask.TaskType.MANUAL_TASK
),
task_name=PaperlessTask.TaskName.CHECK_SANITY,
status=states.STARTED,
date_created=timezone.now(),
date_started=timezone.now(),
)
messages = SanityCheckMessages()
present_files = _build_present_files()
documents = Document.global_objects.all()
for doc in iter_wrapper(documents):
_check_document(doc, messages, present_files)
# other document checks
if not doc.content:
messages.info(doc.pk, "Document contains no OCR data")
for extra_file in present_files:
messages.warning(None, f"Orphaned file in media dir: {extra_file}")
paperless_task.status = states.SUCCESS if not messages.has_error else states.FAILURE
if messages.total_issue_count == 0:
paperless_task.result = "No issues found."
else:
parts: list[str] = []
if messages.document_error_count:
parts.append(f"{messages.document_error_count} document(s) with errors")
if messages.document_warning_count:
parts.append(f"{messages.document_warning_count} document(s) with warnings")
if messages.global_warning_count:
parts.append(f"{messages.global_warning_count} global warning(s)")
paperless_task.result = ", ".join(parts) + " found."
if messages.has_error:
paperless_task.result += " Check logs for details."
# result is concatenated messages
paperless_task.result = f"{len(messages)} issues found."
if messages.has_error:
paperless_task.result += " Check logs for details."
paperless_task.date_done = timezone.now()
paperless_task.save(update_fields=["status", "result", "date_done"])
return messages

View File

@@ -596,16 +596,6 @@ def update_filename_and_move_files(
root=settings.ARCHIVE_DIR,
)
# Keep version files in sync with root
if instance.root_document_id is None:
for version_doc in Document.objects.filter(root_document_id=instance.pk).only(
"pk",
):
update_filename_and_move_files(
Document,
version_doc,
)
@shared_task
def process_cf_select_update(custom_field: CustomField) -> None:

View File

@@ -4,13 +4,11 @@ import logging
import shutil
import uuid
import zipfile
from collections.abc import Callable
from collections.abc import Iterable
from pathlib import Path
from tempfile import TemporaryDirectory
from tempfile import mkstemp
from typing import TypeVar
import tqdm
from celery import Task
from celery import shared_task
from celery import states
@@ -68,19 +66,11 @@ from paperless_ai.indexing import llm_index_add_or_update_document
from paperless_ai.indexing import llm_index_remove_document
from paperless_ai.indexing import update_llm_index
_T = TypeVar("_T")
IterWrapper = Callable[[Iterable[_T]], Iterable[_T]]
if settings.AUDIT_LOG_ENABLED:
from auditlog.models import LogEntry
logger = logging.getLogger("paperless.tasks")
def _identity(iterable: Iterable[_T]) -> Iterable[_T]:
return iterable
@shared_task
def index_optimize() -> None:
ix = index.open_index()
@@ -88,13 +78,13 @@ def index_optimize() -> None:
writer.commit(optimize=True)
def index_reindex(*, iter_wrapper: IterWrapper[Document] = _identity) -> None:
def index_reindex(*, progress_bar_disable=False) -> None:
documents = Document.objects.all()
ix = index.open_index(recreate=True)
with AsyncWriter(ix) as writer:
for document in iter_wrapper(documents):
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
index.update_document(writer, document)
@@ -237,30 +227,20 @@ def consume_file(
@shared_task
def sanity_check(*, scheduled=True, raise_on_error=True):
messages = sanity_checker.check_sanity(scheduled=scheduled)
messages.log_messages()
if not messages.has_error and not messages.has_warning and not messages.has_info:
return "No issues detected."
parts: list[str] = []
if messages.document_error_count:
parts.append(f"{messages.document_error_count} document(s) with errors")
if messages.document_warning_count:
parts.append(f"{messages.document_warning_count} document(s) with warnings")
if messages.document_info_count:
parts.append(f"{messages.document_info_count} document(s) with infos")
if messages.global_warning_count:
parts.append(f"{messages.global_warning_count} global warning(s)")
summary = ", ".join(parts) + " found."
if messages.has_error:
message = summary + " Check logs for details."
message = "Sanity check exited with errors. See log."
if raise_on_error:
raise SanityCheckFailedException(message)
return message
return summary
elif messages.has_warning:
return "Sanity check exited with warnings. See log."
elif len(messages) > 0:
return "Sanity check exited with infos. See log."
else:
return "No issues detected."
@shared_task
@@ -285,6 +265,7 @@ def bulk_update_documents(document_ids) -> None:
ai_config = AIConfig()
if ai_config.llm_index_enabled:
update_llm_index(
progress_bar_disable=True,
rebuild=False,
)
@@ -625,7 +606,7 @@ def update_document_parent_tags(tag: Tag, new_parent: Tag) -> None:
@shared_task
def llmindex_index(
*,
iter_wrapper: IterWrapper[Document] = _identity,
progress_bar_disable=True,
rebuild=False,
scheduled=True,
auto=False,
@@ -648,7 +629,7 @@ def llmindex_index(
try:
result = update_llm_index(
iter_wrapper=iter_wrapper,
progress_bar_disable=progress_bar_disable,
rebuild=rebuild,
)
task.status = states.SUCCESS

View File

@@ -1,96 +1,10 @@
import shutil
import zoneinfo
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING
import filelock
import pytest
from django.contrib.auth import get_user_model
from pytest_django.fixtures import SettingsWrapper
from rest_framework.test import APIClient
from documents.tests.factories import DocumentFactory
if TYPE_CHECKING:
from documents.models import Document
@dataclass(frozen=True, slots=True)
class PaperlessDirs:
"""Standard Paperless-ngx directory layout for tests."""
media: Path
originals: Path
archive: Path
thumbnails: Path
@pytest.fixture(scope="session")
def samples_dir() -> Path:
    """Path to the shared test sample documents.

    Resolved relative to this file: ``<this dir>/samples/documents``.
    """
    return Path(__file__).parent / "samples" / "documents"
@pytest.fixture()
def paperless_dirs(tmp_path: Path) -> PaperlessDirs:
    """Create and return the directory structure for testing.

    All directories live below ``tmp_path / "media"``; the originals,
    archive and thumbnails directories are created on disk (``media`` is
    created implicitly as their parent).
    """
    media = tmp_path / "media"
    dirs = PaperlessDirs(
        media=media,
        originals=media / "documents" / "originals",
        archive=media / "documents" / "archive",
        thumbnails=media / "documents" / "thumbnails",
    )
    for d in (dirs.originals, dirs.archive, dirs.thumbnails):
        d.mkdir(parents=True)
    return dirs
@pytest.fixture()
def _media_settings(
    paperless_dirs: PaperlessDirs,
    settings: SettingsWrapper,
) -> None:
    """Configure Django settings to point at temp directories.

    Besides the directory settings, the media lock file is placed inside the
    temporary media root, a fixed set of ignorable file names is installed
    and the application logo is cleared.
    """
    settings.MEDIA_ROOT = paperless_dirs.media
    settings.ORIGINALS_DIR = paperless_dirs.originals
    settings.ARCHIVE_DIR = paperless_dirs.archive
    settings.THUMBNAIL_DIR = paperless_dirs.thumbnails
    settings.MEDIA_LOCK = paperless_dirs.media / "media.lock"
    settings.IGNORABLE_FILES = {".DS_Store", "Thumbs.db", "desktop.ini"}
    settings.APP_LOGO = ""
@pytest.fixture()
def sample_doc(
    paperless_dirs: PaperlessDirs,
    _media_settings: None,
    samples_dir: Path,
) -> "Document":
    """Create a document with valid files and matching checksums.

    The sample original, archive and thumbnail files are copied into the
    temporary directories while holding the media lock, then a matching
    ``Document`` row is created through ``DocumentFactory``.
    """
    with filelock.FileLock(paperless_dirs.media / "media.lock"):
        shutil.copy(
            samples_dir / "originals" / "0000001.pdf",
            paperless_dirs.originals / "0000001.pdf",
        )
        shutil.copy(
            samples_dir / "archive" / "0000001.pdf",
            paperless_dirs.archive / "0000001.pdf",
        )
        shutil.copy(
            samples_dir / "thumbnails" / "0000001.webp",
            paperless_dirs.thumbnails / "0000001.webp",
        )

    # NOTE(review): the checksums are presumably the MD5 digests of the
    # copied sample files - confirm against the samples directory.
    return DocumentFactory(
        title="test",
        checksum="42995833e01aea9b3edee44bbfdd7ce1",
        archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
        content="test content",
        pk=1,
        filename="0000001.pdf",
        mime_type="application/pdf",
        archive_filename="0000001.pdf",
    )
@pytest.fixture()
def settings_timezone(settings: SettingsWrapper) -> zoneinfo.ZoneInfo:

View File

@@ -1,193 +0,0 @@
"""Tests for the document_sanity_checker management command.
Verifies Rich rendering (table, panel, summary) and end-to-end CLI behavior.
"""
from __future__ import annotations
from io import StringIO
from pathlib import Path
from typing import TYPE_CHECKING
import pytest
from django.core.management import call_command
from rich.console import Console
from documents.management.commands.document_sanity_checker import Command
from documents.sanity_checker import SanityCheckMessages
from documents.tests.factories import DocumentFactory
if TYPE_CHECKING:
from documents.models import Document
from documents.tests.conftest import PaperlessDirs
def _render_to_string(messages: SanityCheckMessages) -> str:
    """Render command output to a plain string for assertion.

    A fresh ``Command`` is given a colorless, 120-column Rich console that
    writes into an in-memory buffer, so the rendered text can be inspected.
    """
    buf = StringIO()
    cmd = Command()
    cmd.console = Console(file=buf, width=120, no_color=True)
    cmd._render_results(messages)
    return buf.getvalue()
# ---------------------------------------------------------------------------
# Rich rendering
# ---------------------------------------------------------------------------
class TestRenderResultsNoIssues:
    """No DB access needed -- renders an empty SanityCheckMessages."""

    def test_shows_panel(self) -> None:
        """An empty result set renders the "no issues" panel with its title."""
        output = _render_to_string(SanityCheckMessages())
        assert "No issues detected" in output
        assert "Sanity Check" in output
@pytest.mark.django_db
class TestRenderResultsWithIssues:
    """Table rendering for messages of every level and origin."""

    def test_error_row(self, sample_doc: Document) -> None:
        """An error shows its level, text, and the document's pk and title."""
        msgs = SanityCheckMessages()
        msgs.error(sample_doc.pk, "Original missing")
        output = _render_to_string(msgs)
        assert "Sanity Check Results" in output
        assert "ERROR" in output
        assert "Original missing" in output
        assert f"#{sample_doc.pk}" in output
        assert sample_doc.title in output

    def test_warning_row(self, sample_doc: Document) -> None:
        """A warning is rendered with the WARN label."""
        msgs = SanityCheckMessages()
        msgs.warning(sample_doc.pk, "Suspicious file")
        output = _render_to_string(msgs)
        assert "WARN" in output
        assert "Suspicious file" in output

    def test_info_row(self, sample_doc: Document) -> None:
        """An info message is rendered with the INFO label."""
        msgs = SanityCheckMessages()
        msgs.info(sample_doc.pk, "No OCR data")
        output = _render_to_string(msgs)
        assert "INFO" in output
        assert "No OCR data" in output

    @pytest.mark.usefixtures("_media_settings")
    def test_global_message(self) -> None:
        """Messages without a document pk are labelled as global."""
        msgs = SanityCheckMessages()
        msgs.warning(None, "Orphaned file: /tmp/stray.pdf")
        output = _render_to_string(msgs)
        assert "(global)" in output
        assert "Orphaned file" in output

    def test_multiple_messages_same_doc(self, sample_doc: Document) -> None:
        """Every message of a document gets rendered, not only the first."""
        msgs = SanityCheckMessages()
        msgs.error(sample_doc.pk, "Thumbnail missing")
        msgs.error(sample_doc.pk, "Checksum mismatch")
        output = _render_to_string(msgs)
        assert "Thumbnail missing" in output
        assert "Checksum mismatch" in output

    @pytest.mark.usefixtures("_media_settings")
    def test_unknown_doc_pk(self) -> None:
        """A pk with no matching document falls back to the "Unknown" title."""
        msgs = SanityCheckMessages()
        msgs.error(99999, "Ghost document")
        output = _render_to_string(msgs)
        assert "#99999" in output
        assert "Unknown" in output
@pytest.mark.django_db
class TestRenderResultsSummary:
    """Summary line printed below the results table."""

    def test_errors_only(self, sample_doc: Document) -> None:
        """A single document error is summarized as one document with errors."""
        msgs = SanityCheckMessages()
        msgs.error(sample_doc.pk, "broken")
        output = _render_to_string(msgs)
        # Assert the whole phrase, as test_all_levels_combined does, so that
        # a stray "errors" elsewhere in the output cannot satisfy the test.
        assert "1 document(s) with errors" in output

    def test_warnings_only(self, sample_doc: Document) -> None:
        """A single document warning is summarized as one document with warnings."""
        msgs = SanityCheckMessages()
        msgs.warning(sample_doc.pk, "odd")
        output = _render_to_string(msgs)
        assert "1 document(s) with warnings" in output

    def test_infos_only(self, sample_doc: Document) -> None:
        """A single info message is summarized as one document with infos."""
        msgs = SanityCheckMessages()
        msgs.info(sample_doc.pk, "no OCR")
        output = _render_to_string(msgs)
        assert "1 document(s) with infos" in output

    def test_empty_messages(self) -> None:
        """Without any message the "no issues" text is shown."""
        msgs = SanityCheckMessages()
        output = _render_to_string(msgs)
        assert "No issues detected." in output

    def test_document_errors_and_global_warnings(self, sample_doc: Document) -> None:
        """Global warnings are counted separately from document issues."""
        msgs = SanityCheckMessages()
        msgs.error(sample_doc.pk, "broken")
        msgs.warning(None, "orphan")
        output = _render_to_string(msgs)
        assert "1 document(s) with errors" in output
        assert "1 global warning(s)" in output
        assert "2 document(s)" not in output

    def test_global_warnings_only(self) -> None:
        """Only the global part appears when no document is affected."""
        msgs = SanityCheckMessages()
        msgs.warning(None, "extra file")
        output = _render_to_string(msgs)
        assert "1 global warning(s)" in output
        assert "document(s) with" not in output

    def test_all_levels_combined(self, sample_doc: Document) -> None:
        """Every category shows up once when all levels are present."""
        msgs = SanityCheckMessages()
        msgs.error(sample_doc.pk, "broken")
        msgs.warning(sample_doc.pk, "odd")
        msgs.info(sample_doc.pk, "fyi")
        msgs.warning(None, "extra file")
        output = _render_to_string(msgs)
        assert "1 document(s) with errors" in output
        assert "1 document(s) with warnings" in output
        assert "1 document(s) with infos" in output
        assert "1 global warning(s)" in output
# ---------------------------------------------------------------------------
# End-to-end command execution
# ---------------------------------------------------------------------------
@pytest.mark.django_db
@pytest.mark.management
class TestDocumentSanityCheckerCommand:
    """End-to-end runs of the ``document_sanity_checker`` command."""

    def test_no_issues(self, sample_doc: Document) -> None:
        """A consistent archive reports that no issues were detected."""
        out = StringIO()
        call_command("document_sanity_checker", "--no-progress-bar", stdout=out)
        assert "No issues detected" in out.getvalue()

    def test_missing_original(self, sample_doc: Document) -> None:
        """Deleting the original file is reported as an error."""
        Path(sample_doc.source_path).unlink()
        out = StringIO()
        call_command("document_sanity_checker", "--no-progress-bar", stdout=out)
        output = out.getvalue()
        assert "ERROR" in output
        assert "Original of document does not exist" in output

    @pytest.mark.usefixtures("_media_settings")
    def test_checksum_mismatch(self, paperless_dirs: PaperlessDirs) -> None:
        """Lightweight document with zero-byte files triggers checksum mismatch."""
        doc = DocumentFactory(
            title="test",
            content="test",
            filename="test.pdf",
            checksum="abc",
        )
        # Empty files exist on disk, so only the stored checksum is wrong.
        Path(doc.source_path).touch()
        Path(doc.thumbnail_path).touch()
        out = StringIO()
        call_command("document_sanity_checker", "--no-progress-bar", stdout=out)
        output = out.getvalue()
        assert "ERROR" in output
        assert "Checksum mismatch. Stored: abc, actual:" in output

View File

@@ -699,14 +699,6 @@ class TestConsumer(
self.assertIsNotNone(root_doc)
assert root_doc is not None
root_storage_path = StoragePath.objects.create(
name="version-root-path",
path="root/{{title}}",
)
root_doc.storage_path = root_storage_path
root_doc.archive_serial_number = 42
root_doc.save()
actor = User.objects.create_user(
username="actor",
email="actor@example.com",
@@ -743,7 +735,7 @@ class TestConsumer(
)
consumer.setup()
try:
self.assertEqual(consumer.filename, version_file.name)
self.assertTrue(consumer.filename.endswith("_v0.pdf"))
consumer.run()
finally:
consumer.cleanup()
@@ -753,10 +745,8 @@ class TestConsumer(
version = versions.first()
assert version is not None
assert version.original_filename is not None
self.assertEqual(version.version_index, 1)
self.assertEqual(version.version_label, "v2")
self.assertIsNone(version.archive_serial_number)
self.assertEqual(version.original_filename, version_file.name)
self.assertTrue(version.original_filename.endswith("_v0.pdf"))
self.assertTrue(bool(version.content))
@override_settings(AUDIT_LOG_ENABLED=True)
@@ -805,7 +795,7 @@ class TestConsumer(
)
consumer.setup()
try:
self.assertEqual(consumer.filename, "valid_pdf_version-upload")
self.assertEqual(consumer.filename, "valid_pdf_version-upload_v0")
consumer.run()
finally:
consumer.cleanup()
@@ -815,67 +805,9 @@ class TestConsumer(
)
self.assertIsNotNone(version)
assert version is not None
self.assertEqual(version.version_index, 1)
self.assertEqual(version.original_filename, "valid_pdf_version-upload")
self.assertEqual(version.original_filename, "valid_pdf_version-upload_v0")
self.assertTrue(bool(version.content))
@override_settings(AUDIT_LOG_ENABLED=True)
@mock.patch("documents.consumer.load_classifier")
def test_consume_version_index_monotonic_after_version_deletion(self, m) -> None:
    """Version indexes keep increasing even after a version was deleted.

    Consumes a root document, adds a version (index 1), deletes it and then
    adds another version, which is expected to receive index 2 rather than
    reusing index 1.
    """
    m.return_value = MagicMock()

    with self.get_consumer(self.get_test_file()) as consumer:
        consumer.run()

    root_doc = Document.objects.first()
    self.assertIsNotNone(root_doc)
    assert root_doc is not None

    def consume_version(version_file: Path) -> Document:
        """Consume ``version_file`` as a new version of ``root_doc``."""
        status = DummyProgressManager(version_file.name, None)
        overrides = DocumentMetadataOverrides()
        doc = ConsumableDocument(
            DocumentSource.ApiUpload,
            original_file=version_file,
            root_document_id=root_doc.pk,
        )

        # Run the preflight plugin first, then the consumer itself.
        preflight = ConsumerPreflightPlugin(
            doc,
            overrides,
            status,  # type: ignore[arg-type]
            self.dirs.scratch_dir,
            "task-id",
        )
        preflight.setup()
        preflight.run()

        consumer = ConsumerPlugin(
            doc,
            overrides,
            status,  # type: ignore[arg-type]
            self.dirs.scratch_dir,
            "task-id",
        )
        consumer.setup()
        try:
            consumer.run()
        finally:
            consumer.cleanup()

        # The most recently created version of the root document.
        version = (
            Document.objects.filter(root_document=root_doc).order_by("-id").first()
        )
        assert version is not None
        return version

    v1 = consume_version(self.get_test_file2())
    self.assertEqual(v1.version_index, 1)
    v1.delete()

    # The next version should have version_index 2, even though version_index 1 was deleted
    v2 = consume_version(self.get_test_file())
    self.assertEqual(v2.version_index, 2)
@mock.patch("documents.consumer.load_classifier")
def testClassifyDocument(self, m) -> None:
correspondent = Correspondent.objects.create(

View File

@@ -77,58 +77,6 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
settings.ORIGINALS_DIR / "test" / "test.pdf",
)
@override_settings(FILENAME_FORMAT=None)
def test_root_storage_path_change_updates_version_files(self) -> None:
    """
    Saving a root document with a new storage path renames the files of
    the root and of its version, leaving nothing behind in the old folder.
    """
    previous_path = StoragePath.objects.create(
        name="old-path",
        path="old/{{title}}",
    )
    target_path = StoragePath.objects.create(
        name="new-path",
        path="new/{{title}}",
    )

    root = Document.objects.create(
        title="rootdoc",
        mime_type="application/pdf",
        checksum="root-checksum",
        storage_path=previous_path,
    )
    version = Document.objects.create(
        title="version-title",
        mime_type="application/pdf",
        checksum="version-checksum",
        root_document=root,
        version_index=1,
    )
    family = (root, version)

    # Store the generated filenames with a queryset update, then reload.
    for member in family:
        Document.objects.filter(pk=member.pk).update(
            filename=generate_filename(member),
        )
    for member in family:
        member.refresh_from_db()

    # Put empty placeholder files where the source files are expected.
    for member in family:
        create_source_path_directory(member.source_path)
        Path(member.source_path).touch()

    root.storage_path = target_path
    root.save()
    for member in family:
        member.refresh_from_db()

    self.assertEqual(root.filename, "new/rootdoc.pdf")
    self.assertEqual(version.filename, "new/rootdoc_v1.pdf")
    for member in family:
        self.assertIsFile(member.source_path)

    stale_dir = settings.ORIGINALS_DIR / "old"
    for leftover in ("rootdoc.pdf", "rootdoc_v1.pdf"):
        self.assertIsNotFile(stale_dir / leftover)
@override_settings(FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_missing_permissions(self) -> None:
document = Document()
@@ -1274,94 +1222,6 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
Path("logs.pdf"),
)
@override_settings(FILENAME_FORMAT="{title}")
def test_version_index_suffix_for_template_filename(self) -> None:
    """
    With a ``{title}`` filename format, a version's generated filename
    carries a ``_v<index>`` suffix, placed before any collision counter.
    """
    shared_fields = {"title": "the_doc", "mime_type": "application/pdf"}
    root = Document.objects.create(checksum="root-checksum", **shared_fields)
    version = Document.objects.create(
        checksum="version-checksum",
        root_document=root,
        version_index=1,
        **shared_fields,
    )

    plain_name = generate_filename(version)
    self.assertEqual(plain_name, Path("the_doc_v1.pdf"))

    counted_name = generate_filename(version, counter=1)
    self.assertEqual(counted_name, Path("the_doc_v1_01.pdf"))
@override_settings(FILENAME_FORMAT=None)
def test_version_index_suffix_for_default_filename(self) -> None:
    """
    Without a filename format, a version is named after the zero-padded
    primary key of its root plus ``_v<index>``, for original and archive.
    """
    shared_fields = {"title": "root", "mime_type": "text/plain"}
    root = Document.objects.create(checksum="root-checksum", **shared_fields)
    version = Document.objects.create(
        checksum="version-checksum",
        root_document=root,
        version_index=2,
        **shared_fields,
    )

    # Both expected names share the same stem and differ only by extension.
    expected_stem = f"{root.pk:07d}_v2"

    self.assertEqual(
        generate_filename(version),
        Path(f"{expected_stem}.txt"),
    )
    self.assertEqual(
        generate_filename(version, archive_filename=True),
        Path(f"{expected_stem}.pdf"),
    )
@override_settings(FILENAME_FORMAT="{original_name}")
def test_version_index_suffix_with_original_name_placeholder(self) -> None:
    """
    With ``{original_name}``, the version's filename is built from the
    root's original filename, not from the version's own upload name.
    """
    shared_fields = {"title": "root", "mime_type": "application/pdf"}
    root = Document.objects.create(
        checksum="root-checksum",
        original_filename="root-upload.pdf",
        **shared_fields,
    )
    version = Document.objects.create(
        checksum="version-checksum",
        root_document=root,
        version_index=1,
        original_filename="version-upload.pdf",
        **shared_fields,
    )

    generated = generate_filename(version)
    self.assertEqual(generated, Path("root-upload_v1.pdf"))
def test_version_index_suffix_with_storage_path(self) -> None:
    """
    A version without its own storage path is named from the root's
    storage path and the root's title, plus the ``_v<index>`` suffix.
    """
    folder_path = StoragePath.objects.create(
        name="vtest",
        path="folder/{{title}}",
    )
    pdf = {"mime_type": "application/pdf"}
    root = Document.objects.create(
        title="storage_doc",
        checksum="root-checksum",
        storage_path=folder_path,
        **pdf,
    )
    version = Document.objects.create(
        title="version_title_should_not_be_used",
        checksum="version-checksum",
        root_document=root,
        version_index=3,
        **pdf,
    )

    generated = generate_filename(version)
    self.assertEqual(generated, Path("folder/storage_doc_v3.pdf"))
@override_settings(
FILENAME_FORMAT="XX{correspondent}/{title}",
FILENAME_FORMAT_REMOVE_NONE=True,

View File

@@ -134,7 +134,6 @@ class TestRenamer(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertIsFile(doc2.archive_path)
@pytest.mark.management
class TestCreateClassifier(TestCase):
@mock.patch(
"documents.management.commands.document_create_classifier.train_classifier",
@@ -145,6 +144,32 @@ class TestCreateClassifier(TestCase):
m.assert_called_once()
@pytest.mark.management
class TestSanityChecker(DirectoriesMixin, TestCase):
    """Checks the log output of the ``document_sanity_checker`` command."""

    def test_no_issues(self) -> None:
        """With no documents, exactly one "no issues" line is logged."""
        with self.assertLogs() as logs:
            call_command("document_sanity_checker")

        emitted = logs.output
        self.assertEqual(len(emitted), 1)
        self.assertIn("Sanity checker detected no issues.", emitted[0])

    def test_errors(self) -> None:
        """A stored checksum that does not match the file is logged."""
        document = Document.objects.create(
            title="test",
            content="test",
            filename="test.pdf",
            checksum="abc",
        )
        # Empty placeholder files: present on disk, but not matching "abc".
        for placeholder in (document.source_path, document.thumbnail_path):
            Path(placeholder).touch()

        with self.assertLogs() as logs:
            call_command("document_sanity_checker")

        emitted = logs.output
        self.assertEqual(len(emitted), 2)
        self.assertIn("Checksum mismatch. Stored: abc, actual:", emitted[1])
@pytest.mark.management
class TestConvertMariaDBUUID(TestCase):
@mock.patch("django.db.connection.schema_editor")

View File

@@ -288,7 +288,7 @@ class TestExportImport(
self.assertEqual(Permission.objects.count(), num_permission_objects)
messages = check_sanity()
# everything is alright after the test
self.assertEqual(messages.total_issue_count, 0)
self.assertEqual(len(messages), 0)
def test_exporter_with_filename_format(self) -> None:
shutil.rmtree(Path(self.dirs.media_dir) / "documents")

View File

@@ -1,295 +1,192 @@
"""Tests for the sanity checker module.
Tests exercise ``check_sanity`` as a whole, verifying document validation,
orphan detection, task recording, and the iter_wrapper contract.
"""
from __future__ import annotations
import logging
import shutil
from pathlib import Path
from typing import TYPE_CHECKING
import pytest
import filelock
from django.conf import settings
from django.test import TestCase
from django.test import override_settings
from documents.models import Document
from documents.models import PaperlessTask
from documents.sanity_checker import check_sanity
if TYPE_CHECKING:
from collections.abc import Iterable
from documents.tests.conftest import PaperlessDirs
from documents.tests.utils import DirectoriesMixin
@pytest.mark.django_db
class TestCheckSanityNoDocuments:
"""Sanity checks against an empty archive."""
class TestSanityCheck(DirectoriesMixin, TestCase):
def make_test_data(self):
with filelock.FileLock(settings.MEDIA_LOCK):
# just make sure that the lockfile is present.
shutil.copy(
(
Path(__file__).parent
/ "samples"
/ "documents"
/ "originals"
/ "0000001.pdf"
),
Path(self.dirs.originals_dir) / "0000001.pdf",
)
shutil.copy(
(
Path(__file__).parent
/ "samples"
/ "documents"
/ "archive"
/ "0000001.pdf"
),
Path(self.dirs.archive_dir) / "0000001.pdf",
)
shutil.copy(
(
Path(__file__).parent
/ "samples"
/ "documents"
/ "thumbnails"
/ "0000001.webp"
),
Path(self.dirs.thumbnail_dir) / "0000001.webp",
)
@pytest.mark.usefixtures("_media_settings")
def test_no_documents(self) -> None:
return Document.objects.create(
title="test",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
content="test",
pk=1,
filename="0000001.pdf",
mime_type="application/pdf",
archive_filename="0000001.pdf",
)
def assertSanityError(self, doc: Document, messageRegex) -> None:
messages = check_sanity()
assert not messages.has_error
assert not messages.has_warning
assert messages.total_issue_count == 0
@pytest.mark.usefixtures("_media_settings")
def test_no_issues_logs_clean(self, caplog: pytest.LogCaptureFixture) -> None:
messages = check_sanity()
with caplog.at_level(logging.INFO, logger="paperless.sanity_checker"):
self.assertTrue(messages.has_error)
with self.assertLogs() as capture:
messages.log_messages()
assert "Sanity checker detected no issues." in caplog.text
@pytest.mark.django_db
class TestCheckSanityHealthyDocument:
    """Sanity check run with the ``sample_doc`` fixture left untouched."""

    def test_no_errors(self, sample_doc: Document) -> None:
        """No errors, no warnings and a zero issue count are reported."""
        report = check_sanity()

        assert not report.has_error
        assert not report.has_warning
        assert 0 == report.total_issue_count
@pytest.mark.django_db
class TestCheckSanityThumbnail:
def test_missing(self, sample_doc: Document) -> None:
    """Removing the thumbnail file is reported as an error on the document."""
    Path(sample_doc.thumbnail_path).unlink()

    report = check_sanity()

    assert report.has_error
    texts = (entry["message"] for entry in report[sample_doc.pk])
    assert any("Thumbnail of document does not exist" in text for text in texts)
def test_unreadable(self, sample_doc: Document) -> None:
thumb = Path(sample_doc.thumbnail_path)
thumb.chmod(0o000)
try:
messages = check_sanity()
assert messages.has_error
assert any(
"Cannot read thumbnail" in m["message"] for m in messages[sample_doc.pk]
self.assertEqual(
capture.records[0].message,
f"Detected following issue(s) with document #{doc.pk}, titled {doc.title}",
)
finally:
thumb.chmod(0o644)
self.assertRegex(capture.records[1].message, messageRegex)
@pytest.mark.django_db
class TestCheckSanityOriginal:
def test_missing(self, sample_doc: Document) -> None:
Path(sample_doc.source_path).unlink()
def test_no_issues(self) -> None:
self.make_test_data()
messages = check_sanity()
assert messages.has_error
assert any(
"Original of document does not exist" in m["message"]
for m in messages[sample_doc.pk]
)
def test_checksum_mismatch(self, sample_doc: Document) -> None:
    """A wrong stored checksum is reported, and the message quotes it."""
    sample_doc.checksum = "badhash"
    sample_doc.save()

    report = check_sanity()

    assert report.has_error
    required = ("Checksum mismatch", "badhash")
    texts = (entry["message"] for entry in report[sample_doc.pk])
    assert any(all(part in text for part in required) for text in texts)
def test_unreadable(self, sample_doc: Document) -> None:
src = Path(sample_doc.source_path)
src.chmod(0o000)
try:
messages = check_sanity()
assert messages.has_error
assert any(
"Cannot read original" in m["message"] for m in messages[sample_doc.pk]
self.assertFalse(messages.has_error)
self.assertFalse(messages.has_warning)
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.INFO)
self.assertEqual(
capture.records[0].message,
"Sanity checker detected no issues.",
)
finally:
src.chmod(0o644)
def test_no_docs(self) -> None:
    """With no documents created, the sanity check result is empty."""
    report = check_sanity()
    self.assertEqual(len(report), 0)
@pytest.mark.django_db
class TestCheckSanityArchive:
def test_checksum_without_filename(self, sample_doc: Document) -> None:
sample_doc.archive_filename = None
sample_doc.save()
def test_success(self) -> None:
    """A document with all files in place yields an empty result."""
    self.make_test_data()
    self.assertEqual(len(check_sanity()), 0)

def test_no_thumbnail(self) -> None:
    """A deleted thumbnail is reported via ``assertSanityError``."""
    doc = self.make_test_data()
    Path(doc.thumbnail_path).unlink()
    self.assertSanityError(doc, "Thumbnail of document does not exist")

def test_thumbnail_no_access(self) -> None:
    """A thumbnail with mode 000 is reported as unreadable."""
    doc = self.make_test_data()
    thumbnail = Path(doc.thumbnail_path)
    thumbnail.chmod(0o000)
    try:
        self.assertSanityError(doc, "Cannot read thumbnail file of document")
    finally:
        # Restore the permissions even when the assertion fails, so the
        # file is never left unreadable.
        thumbnail.chmod(0o777)

def test_no_original(self) -> None:
    """A deleted original file is reported via ``assertSanityError``."""
    doc = self.make_test_data()
    Path(doc.source_path).unlink()
    self.assertSanityError(doc, "Original of document does not exist.")

def test_original_no_access(self) -> None:
    """An original file with mode 000 is reported as unreadable."""
    doc = self.make_test_data()
    original = Path(doc.source_path)
    original.chmod(0o000)
    try:
        self.assertSanityError(doc, "Cannot read original file of document")
    finally:
        # Restore the permissions even when the assertion fails.
        original.chmod(0o777)

def test_original_checksum_mismatch(self) -> None:
    """A wrong stored checksum is reported together with the stored value."""
    doc = self.make_test_data()
    doc.checksum = "WOW"
    doc.save()
    self.assertSanityError(doc, "Checksum mismatch. Stored: WOW, actual: ")

def test_no_archive(self) -> None:
    """A deleted archive file is reported via ``assertSanityError``."""
    doc = self.make_test_data()
    Path(doc.archive_path).unlink()
    self.assertSanityError(doc, "Archived version of document does not exist.")

def test_archive_no_access(self) -> None:
    """An archive file with mode 000 is reported as unreadable."""
    doc = self.make_test_data()
    archive = Path(doc.archive_path)
    archive.chmod(0o000)
    try:
        self.assertSanityError(doc, "Cannot read archive file of document")
    finally:
        # Restore the permissions even when the assertion fails.
        archive.chmod(0o777)

def test_archive_checksum_mismatch(self) -> None:
    """A wrong stored archive checksum is reported."""
    doc = self.make_test_data()
    doc.archive_checksum = "WOW"
    doc.save()
    self.assertSanityError(doc, "Checksum mismatch of archived document")
def test_empty_content(self) -> None:
doc = self.make_test_data()
doc.content = ""
doc.save()
messages = check_sanity()
assert messages.has_error
assert any(
"checksum, but no archive filename" in m["message"]
for m in messages[sample_doc.pk]
self.assertFalse(messages.has_error)
self.assertFalse(messages.has_warning)
self.assertEqual(len(messages), 1)
self.assertRegex(
messages[doc.pk][0]["message"],
"Document contains no OCR data",
)
def test_filename_without_checksum(self, sample_doc: Document) -> None:
sample_doc.archive_checksum = None
sample_doc.save()
def test_orphaned_file(self) -> None:
self.make_test_data()
Path(self.dirs.originals_dir, "orphaned").touch()
messages = check_sanity()
assert messages.has_error
assert any(
"checksum is missing" in m["message"] for m in messages[sample_doc.pk]
self.assertTrue(messages.has_warning)
self.assertRegex(
messages._messages[None][0]["message"],
"Orphaned file in media dir",
)
def test_missing_file(self, sample_doc: Document) -> None:
    """Removing the archive file is reported as an error on the document."""
    Path(sample_doc.archive_path).unlink()

    report = check_sanity()

    assert report.has_error
    texts = (entry["message"] for entry in report[sample_doc.pk])
    assert any(
        "Archived version of document does not exist" in text for text in texts
    )
def test_checksum_mismatch(self, sample_doc: Document) -> None:
    """A wrong stored archive checksum is reported as an error."""
    sample_doc.archive_checksum = "wronghash"
    sample_doc.save()

    report = check_sanity()

    assert report.has_error
    texts = (entry["message"] for entry in report[sample_doc.pk])
    assert any("Checksum mismatch of archived document" in text for text in texts)
def test_unreadable(self, sample_doc: Document) -> None:
    """An archive file with mode 000 is reported as unreadable."""
    archive_file = Path(sample_doc.archive_path)
    archive_file.chmod(0o000)
    try:
        report = check_sanity()
        assert report.has_error
        texts = (entry["message"] for entry in report[sample_doc.pk])
        assert any("Cannot read archive" in text for text in texts)
    finally:
        # Always restore the permissions, whatever the assertions did.
        archive_file.chmod(0o644)
def test_no_archive_at_all(self, sample_doc: Document) -> None:
    """A document with no archive file, checksum or filename raises no error."""
    Path(sample_doc.archive_path).unlink()
    # Clear both archive fields so the record agrees with the missing file.
    sample_doc.archive_checksum = None
    sample_doc.archive_filename = None
    sample_doc.save()

    report = check_sanity()

    assert not report.has_error
@pytest.mark.django_db
class TestCheckSanityContent:
@pytest.mark.parametrize(
"content",
[
pytest.param("", id="empty-string"),
],
@override_settings(
APP_LOGO="logo/logo.png",
)
def test_no_content(self, sample_doc: Document, content: str) -> None:
sample_doc.content = content
sample_doc.save()
def test_ignore_logo(self) -> None:
self.make_test_data()
logo_dir = Path(self.dirs.media_dir, "logo")
logo_dir.mkdir(parents=True, exist_ok=True)
Path(self.dirs.media_dir, "logo", "logo.png").touch()
messages = check_sanity()
assert not messages.has_error
assert not messages.has_warning
assert any("no OCR data" in m["message"] for m in messages[sample_doc.pk])
self.assertFalse(messages.has_warning)
@pytest.mark.django_db
class TestCheckSanityOrphans:
def test_orphaned_file(
self,
sample_doc: Document,
paperless_dirs: PaperlessDirs,
) -> None:
(paperless_dirs.originals / "orphan.pdf").touch()
def test_ignore_ignorable_files(self) -> None:
self.make_test_data()
Path(self.dirs.media_dir, ".DS_Store").touch()
Path(self.dirs.media_dir, "desktop.ini").touch()
messages = check_sanity()
assert messages.has_warning
assert any("Orphaned file" in m["message"] for m in messages[None])
self.assertFalse(messages.has_warning)
@pytest.mark.usefixtures("_media_settings")
def test_ignorable_files_not_flagged(
    self,
    paperless_dirs: PaperlessDirs,
) -> None:
    """``.DS_Store`` and ``desktop.ini`` in the media dir cause no warning."""
    for ignorable_name in (".DS_Store", "desktop.ini"):
        (paperless_dirs.media / ignorable_name).touch()

    report = check_sanity()

    assert not report.has_warning
def test_archive_filename_no_checksum(self) -> None:
    """An archive filename without an archive checksum is reported."""
    document = self.make_test_data()
    document.archive_checksum = None
    document.save()

    expected_message = "has an archive file, but its checksum is missing."
    self.assertSanityError(document, expected_message)
@pytest.mark.django_db
class TestCheckSanityIterWrapper:
    """Covers the ``iter_wrapper`` keyword argument of ``check_sanity``."""

    def test_wrapper_receives_documents(self, sample_doc: Document) -> None:
        """The wrapper is handed the documents being checked."""
        observed: list[Document] = []

        def recording(source: Iterable[Document]) -> Iterable[Document]:
            # Pass each item through unchanged while remembering it.
            for entry in source:
                observed.append(entry)
                yield entry

        check_sanity(iter_wrapper=recording)

        assert 1 == len(observed)
        assert observed[0].pk == sample_doc.pk

    def test_default_works_without_wrapper(self, sample_doc: Document) -> None:
        """Calling without a wrapper still runs and reports no error."""
        report = check_sanity()
        assert not report.has_error
@pytest.mark.django_db
class TestCheckSanityTaskRecording:
    """Inspects the ``PaperlessTask`` row present after a sanity check."""

    @staticmethod
    def _latest_task() -> PaperlessTask:
        """Return the task with the most recent ``date_created``."""
        return PaperlessTask.objects.latest("date_created")

    @pytest.mark.parametrize(
        ("expected_type", "scheduled"),
        [
            pytest.param(PaperlessTask.TaskType.SCHEDULED_TASK, True, id="scheduled"),
            pytest.param(PaperlessTask.TaskType.MANUAL_TASK, False, id="manual"),
        ],
    )
    @pytest.mark.usefixtures("_media_settings")
    def test_task_type(self, expected_type: str, *, scheduled: bool) -> None:
        """The ``scheduled`` flag selects the recorded task type."""
        check_sanity(scheduled=scheduled)

        recorded = self._latest_task()
        assert recorded.task_name == PaperlessTask.TaskName.CHECK_SANITY
        assert recorded.type == expected_type

    def test_success_status(self, sample_doc: Document) -> None:
        """A clean run is recorded with status ``SUCCESS``."""
        check_sanity()

        recorded = self._latest_task()
        assert recorded.status == "SUCCESS"

    def test_failure_status(self, sample_doc: Document) -> None:
        """A run with a missing original is recorded with status ``FAILURE``."""
        Path(sample_doc.source_path).unlink()
        check_sanity()

        recorded = self._latest_task()
        assert recorded.status == "FAILURE"
        assert "Check logs for details" in recorded.result
@pytest.mark.django_db
class TestCheckSanityLogMessages:
    """Checks what ``log_messages()`` writes to the sanity checker logger."""

    LOGGER_NAME = "paperless.sanity_checker"

    def test_logs_doc_issues(
        self,
        sample_doc: Document,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """Per-document issues are logged with the document's id."""
        Path(sample_doc.source_path).unlink()
        report = check_sanity()

        with caplog.at_level(logging.INFO, logger=self.LOGGER_NAME):
            report.log_messages()

        logged = caplog.text
        assert f"document #{sample_doc.pk}" in logged
        assert "Original of document does not exist" in logged

    def test_logs_global_issues(
        self,
        sample_doc: Document,
        paperless_dirs: PaperlessDirs,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """An orphaned file in the originals directory is logged."""
        orphan = paperless_dirs.originals / "orphan.pdf"
        orphan.touch()
        report = check_sanity()

        with caplog.at_level(logging.WARNING, logger=self.LOGGER_NAME):
            report.log_messages()

        assert "Orphaned file" in caplog.text

    @pytest.mark.usefixtures("_media_settings")
    def test_logs_unknown_doc_pk(self, caplog: pytest.LogCaptureFixture) -> None:
        """A doc PK not in the DB logs 'Unknown' as the title."""
        report = check_sanity()
        report.error(99999, "Ghost document")

        with caplog.at_level(logging.INFO, logger=self.LOGGER_NAME):
            report.log_messages()

        logged = caplog.text
        assert "#99999" in logged
        assert "Unknown" in logged
def test_archive_checksum_no_filename(self) -> None:
    """An archive checksum without an archive filename is reported."""
    document = self.make_test_data()
    document.archive_filename = None
    document.save()

    expected_message = "has an archive file checksum, but no archive filename."
    self.assertSanityError(document, expected_message)

View File

@@ -3,7 +3,6 @@ from datetime import timedelta
from pathlib import Path
from unittest import mock
import pytest
from celery import states
from django.conf import settings
from django.test import TestCase
@@ -106,83 +105,55 @@ class TestClassifier(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertNotEqual(mtime2, mtime3)
@pytest.mark.django_db
class TestSanityCheck:
@pytest.fixture
def mock_check_sanity(self, mocker) -> mock.MagicMock:
return mocker.patch("documents.tasks.sanity_checker.check_sanity")
class TestSanityCheck(DirectoriesMixin, TestCase):
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_success(self, m) -> None:
m.return_value = SanityCheckMessages()
self.assertEqual(tasks.sanity_check(), "No issues detected.")
m.assert_called_once()
def test_sanity_check_success(self, mock_check_sanity: mock.MagicMock) -> None:
mock_check_sanity.return_value = SanityCheckMessages()
assert tasks.sanity_check() == "No issues detected."
mock_check_sanity.assert_called_once()
def test_sanity_check_error_raises(
self,
mock_check_sanity: mock.MagicMock,
sample_doc: Document,
) -> None:
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_error(self, m) -> None:
messages = SanityCheckMessages()
messages.error(sample_doc.pk, "some error")
mock_check_sanity.return_value = messages
with pytest.raises(SanityCheckFailedException):
tasks.sanity_check()
mock_check_sanity.assert_called_once()
messages.error(None, "Some error")
m.return_value = messages
self.assertRaises(SanityCheckFailedException, tasks.sanity_check)
m.assert_called_once()
def test_sanity_check_error_no_raise(
self,
mock_check_sanity: mock.MagicMock,
sample_doc: Document,
) -> None:
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_error_no_raise(self, m) -> None:
messages = SanityCheckMessages()
messages.error(sample_doc.pk, "some error")
mock_check_sanity.return_value = messages
messages.error(None, "Some error")
m.return_value = messages
# No exception should be raised
result = tasks.sanity_check(raise_on_error=False)
assert "1 document(s) with errors" in result
assert "Check logs for details." in result
mock_check_sanity.assert_called_once()
self.assertEqual(
result,
"Sanity check exited with errors. See log.",
)
m.assert_called_once()
def test_sanity_check_warning_only(
self,
mock_check_sanity: mock.MagicMock,
) -> None:
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_warning(self, m) -> None:
messages = SanityCheckMessages()
messages.warning(None, "extra file")
mock_check_sanity.return_value = messages
result = tasks.sanity_check()
assert result == "1 global warning(s) found."
mock_check_sanity.assert_called_once()
messages.warning(None, "Some warning")
m.return_value = messages
self.assertEqual(
tasks.sanity_check(),
"Sanity check exited with warnings. See log.",
)
m.assert_called_once()
def test_sanity_check_info_only(
self,
mock_check_sanity: mock.MagicMock,
sample_doc: Document,
) -> None:
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_info(self, m) -> None:
messages = SanityCheckMessages()
messages.info(sample_doc.pk, "some info")
mock_check_sanity.return_value = messages
result = tasks.sanity_check()
assert result == "1 document(s) with infos found."
mock_check_sanity.assert_called_once()
def test_sanity_check_errors_warnings_and_infos(
self,
mock_check_sanity: mock.MagicMock,
sample_doc: Document,
) -> None:
messages = SanityCheckMessages()
messages.error(sample_doc.pk, "broken")
messages.warning(sample_doc.pk, "odd")
messages.info(sample_doc.pk, "fyi")
messages.warning(None, "extra file")
mock_check_sanity.return_value = messages
result = tasks.sanity_check(raise_on_error=False)
assert "1 document(s) with errors" in result
assert "1 document(s) with warnings" in result
assert "1 document(s) with infos" in result
assert "1 global warning(s)" in result
assert "Check logs for details." in result
mock_check_sanity.assert_called_once()
messages.info(None, "Some info")
m.return_value = messages
self.assertEqual(
tasks.sanity_check(),
"Sanity check exited with infos. See log.",
)
m.assert_called_once()
class TestBulkUpdate(DirectoriesMixin, TestCase):

View File

@@ -378,6 +378,7 @@ class ApplicationConfigurationViewSet(ModelViewSet):
):
# AI index was just enabled and vector store file does not exist
llmindex_index.delay(
progress_bar_disable=True,
rebuild=True,
scheduled=False,
auto=True,

View File

@@ -1,13 +1,11 @@
import logging
import shutil
from collections.abc import Callable
from collections.abc import Iterable
from datetime import timedelta
from pathlib import Path
from typing import TypeVar
import faiss
import llama_index.core.settings as llama_settings
import tqdm
from celery import states
from django.conf import settings
from django.utils import timezone
@@ -31,14 +29,6 @@ from paperless_ai.embedding import build_llm_index_text
from paperless_ai.embedding import get_embedding_dim
from paperless_ai.embedding import get_embedding_model
# Element type carried through an iteration wrapper.
_T = TypeVar("_T")
# A callable that takes an iterable and returns an iterable with the same
# element type.
IterWrapper = Callable[[Iterable[_T]], Iterable[_T]]
# Pass-through wrapper: hands back the iterable it was given, unchanged.
def _identity(iterable: Iterable[_T]) -> Iterable[_T]:
return iterable
logger = logging.getLogger("paperless_ai.indexing")
@@ -166,11 +156,7 @@ def vector_store_file_exists():
return Path(settings.LLM_INDEX_DIR / "default__vector_store.json").exists()
def update_llm_index(
*,
iter_wrapper: IterWrapper[Document] = _identity,
rebuild=False,
) -> str:
def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
"""
Rebuild or update the LLM index.
"""
@@ -190,7 +176,7 @@ def update_llm_index(
embed_model = get_embedding_model()
llama_settings.Settings.embed_model = embed_model
storage_context = get_or_create_storage_context(rebuild=True)
for document in iter_wrapper(documents):
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
document_nodes = build_document_node(document)
nodes.extend(document_nodes)
@@ -198,7 +184,7 @@ def update_llm_index(
nodes=nodes,
storage_context=storage_context,
embed_model=embed_model,
show_progress=False,
show_progress=not progress_bar_disable,
)
msg = "LLM index rebuilt successfully."
else:
@@ -210,7 +196,7 @@ def update_llm_index(
for node in index.docstore.get_nodes(all_node_ids)
}
for document in iter_wrapper(documents):
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
doc_id = str(document.id)
document_modified = document.modified.isoformat()

6
uv.lock generated
View File

@@ -5906,11 +5906,11 @@ wheels = [
[[package]]
name = "whitenoise"
version = "6.12.0"
version = "6.11.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/cb/2a/55b3f3a4ec326cd077c1c3defeee656b9298372a69229134d930151acd01/whitenoise-6.12.0.tar.gz", hash = "sha256:f723ebb76a112e98816ff80fcea0a6c9b8ecde835f8ddda25df7a30a3c2db6ad", size = 26841, upload-time = "2026-02-27T00:05:42.028Z" }
sdist = { url = "https://files.pythonhosted.org/packages/15/95/8c81ec6b6ebcbf8aca2de7603070ccf37dbb873b03f20708e0f7c1664bc6/whitenoise-6.11.0.tar.gz", hash = "sha256:0f5bfce6061ae6611cd9396a8231e088722e4fc67bc13a111be74c738d99375f", size = 26432, upload-time = "2025-09-18T09:16:10.995Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/db/eb/d5583a11486211f3ebd4b385545ae787f32363d453c19fffd81106c9c138/whitenoise-6.12.0-py3-none-any.whl", hash = "sha256:fc5e8c572e33ebf24795b47b6a7da8da3c00cff2349f5b04c02f28d0cc5a3cc2", size = 20302, upload-time = "2026-02-27T00:05:40.086Z" },
{ url = "https://files.pythonhosted.org/packages/6c/e9/4366332f9295fe0647d7d3251ce18f5615fbcb12d02c79a26f8dba9221b3/whitenoise-6.11.0-py3-none-any.whl", hash = "sha256:b2aeb45950597236f53b5342b3121c5de69c8da0109362aee506ce88e022d258", size = 20197, upload-time = "2025-09-18T09:16:09.754Z" },
]
[[package]]