mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-03 07:46:25 +00:00
Compare commits
5 Commits
fix-drop-v
...
feature-im
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
01144b4b1a | ||
|
|
c3dd7615e0 | ||
|
|
8e91f8b802 | ||
|
|
1a680ae84f | ||
|
|
0e8cf4cd9b |
14
.github/workflows/ci-backend.yml
vendored
14
.github/workflows/ci-backend.yml
vendored
@@ -35,18 +35,18 @@ jobs:
|
||||
fail-fast: false
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6.0.2
|
||||
uses: actions/checkout@v6
|
||||
- name: Start containers
|
||||
run: |
|
||||
docker compose --file docker/compose/docker-compose.ci-test.yml pull --quiet
|
||||
docker compose --file docker/compose/docker-compose.ci-test.yml up --detach
|
||||
- name: Set up Python
|
||||
id: setup-python
|
||||
uses: actions/setup-python@v6.2.0
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: "${{ matrix.python-version }}"
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7.3.1
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: ${{ env.DEFAULT_UV_VERSION }}
|
||||
enable-cache: true
|
||||
@@ -83,13 +83,13 @@ jobs:
|
||||
pytest
|
||||
- name: Upload test results to Codecov
|
||||
if: always()
|
||||
uses: codecov/codecov-action@v5.5.2
|
||||
uses: codecov/codecov-action@v5
|
||||
with:
|
||||
flags: backend-python-${{ matrix.python-version }}
|
||||
files: junit.xml
|
||||
report_type: test_results
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v5.5.2
|
||||
uses: codecov/codecov-action@v5
|
||||
with:
|
||||
flags: backend-python-${{ matrix.python-version }}
|
||||
files: coverage.xml
|
||||
@@ -106,14 +106,14 @@ jobs:
|
||||
DEFAULT_PYTHON: "3.12"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6.0.2
|
||||
uses: actions/checkout@v6.0.1
|
||||
- name: Set up Python
|
||||
id: setup-python
|
||||
uses: actions/setup-python@v6.2.0
|
||||
with:
|
||||
python-version: "${{ env.DEFAULT_PYTHON }}"
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7.3.1
|
||||
uses: astral-sh/setup-uv@v7.2.1
|
||||
with:
|
||||
version: ${{ env.DEFAULT_UV_VERSION }}
|
||||
enable-cache: true
|
||||
|
||||
8
.github/workflows/ci-docker.yml
vendored
8
.github/workflows/ci-docker.yml
vendored
@@ -41,7 +41,7 @@ jobs:
|
||||
ref-name: ${{ steps.ref.outputs.name }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6.0.2
|
||||
uses: actions/checkout@v6.0.1
|
||||
- name: Determine ref name
|
||||
id: ref
|
||||
run: |
|
||||
@@ -130,7 +130,7 @@ jobs:
|
||||
type=semver,pattern={{major}}.{{minor}}
|
||||
- name: Build and push by digest
|
||||
id: build
|
||||
uses: docker/build-push-action@v6.19.2
|
||||
uses: docker/build-push-action@v6.18.0
|
||||
with:
|
||||
context: .
|
||||
file: ./Dockerfile
|
||||
@@ -152,7 +152,7 @@ jobs:
|
||||
touch "/tmp/digests/${digest#sha256:}"
|
||||
- name: Upload digest
|
||||
if: steps.check-push.outputs.should-push == 'true'
|
||||
uses: actions/upload-artifact@v7.0.0
|
||||
uses: actions/upload-artifact@v6.0.0
|
||||
with:
|
||||
name: digests-${{ matrix.arch }}
|
||||
path: /tmp/digests/*
|
||||
@@ -168,7 +168,7 @@ jobs:
|
||||
packages: write
|
||||
steps:
|
||||
- name: Download digests
|
||||
uses: actions/download-artifact@v8.0.0
|
||||
uses: actions/download-artifact@v7.0.0
|
||||
with:
|
||||
path: /tmp/digests
|
||||
pattern: digests-*
|
||||
|
||||
12
.github/workflows/ci-docs.yml
vendored
12
.github/workflows/ci-docs.yml
vendored
@@ -33,16 +33,16 @@ jobs:
|
||||
name: Build Documentation
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/configure-pages@v5.0.0
|
||||
- uses: actions/configure-pages@v5
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6.0.2
|
||||
uses: actions/checkout@v6
|
||||
- name: Set up Python
|
||||
id: setup-python
|
||||
uses: actions/setup-python@v6.2.0
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7.3.1
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: ${{ env.DEFAULT_UV_VERSION }}
|
||||
enable-cache: true
|
||||
@@ -58,7 +58,7 @@ jobs:
|
||||
--frozen \
|
||||
zensical build --clean
|
||||
- name: Upload GitHub Pages artifact
|
||||
uses: actions/upload-pages-artifact@v4.0.0
|
||||
uses: actions/upload-pages-artifact@v4
|
||||
with:
|
||||
path: site
|
||||
name: github-pages-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
@@ -72,7 +72,7 @@ jobs:
|
||||
url: ${{ steps.deployment.outputs.page_url }}
|
||||
steps:
|
||||
- name: Deploy GitHub Pages
|
||||
uses: actions/deploy-pages@v4.0.5
|
||||
uses: actions/deploy-pages@v4
|
||||
id: deployment
|
||||
with:
|
||||
artifact_name: github-pages-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
|
||||
44
.github/workflows/ci-frontend.yml
vendored
44
.github/workflows/ci-frontend.yml
vendored
@@ -22,20 +22,20 @@ jobs:
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6.0.2
|
||||
uses: actions/checkout@v6
|
||||
- name: Install pnpm
|
||||
uses: pnpm/action-setup@v4.2.0
|
||||
uses: pnpm/action-setup@v4
|
||||
with:
|
||||
version: 10
|
||||
- name: Use Node.js 24
|
||||
uses: actions/setup-node@v6.2.0
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: 24.x
|
||||
cache: 'pnpm'
|
||||
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
|
||||
- name: Cache frontend dependencies
|
||||
id: cache-frontend-deps
|
||||
uses: actions/cache@v5.0.3
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.pnpm-store
|
||||
@@ -49,19 +49,19 @@ jobs:
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6.0.2
|
||||
uses: actions/checkout@v6
|
||||
- name: Install pnpm
|
||||
uses: pnpm/action-setup@v4.2.0
|
||||
uses: pnpm/action-setup@v4
|
||||
with:
|
||||
version: 10
|
||||
- name: Use Node.js 24
|
||||
uses: actions/setup-node@v6.2.0
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: 24.x
|
||||
cache: 'pnpm'
|
||||
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
|
||||
- name: Cache frontend dependencies
|
||||
uses: actions/cache@v5.0.3
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.pnpm-store
|
||||
@@ -83,19 +83,19 @@ jobs:
|
||||
shard-count: [4]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6.0.2
|
||||
uses: actions/checkout@v6
|
||||
- name: Install pnpm
|
||||
uses: pnpm/action-setup@v4.2.0
|
||||
uses: pnpm/action-setup@v4
|
||||
with:
|
||||
version: 10
|
||||
- name: Use Node.js 24
|
||||
uses: actions/setup-node@v6.2.0
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: 24.x
|
||||
cache: 'pnpm'
|
||||
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
|
||||
- name: Cache frontend dependencies
|
||||
uses: actions/cache@v5.0.3
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.pnpm-store
|
||||
@@ -107,13 +107,13 @@ jobs:
|
||||
run: cd src-ui && pnpm run test --max-workers=2 --shard=${{ matrix.shard-index }}/${{ matrix.shard-count }}
|
||||
- name: Upload test results to Codecov
|
||||
if: always()
|
||||
uses: codecov/codecov-action@v5.5.2
|
||||
uses: codecov/codecov-action@v5
|
||||
with:
|
||||
flags: frontend-node-${{ matrix.node-version }}
|
||||
directory: src-ui/
|
||||
report_type: test_results
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v5.5.2
|
||||
uses: codecov/codecov-action@v5
|
||||
with:
|
||||
flags: frontend-node-${{ matrix.node-version }}
|
||||
directory: src-ui/coverage/
|
||||
@@ -133,19 +133,19 @@ jobs:
|
||||
shard-count: [2]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6.0.2
|
||||
uses: actions/checkout@v6
|
||||
- name: Install pnpm
|
||||
uses: pnpm/action-setup@v4.2.0
|
||||
uses: pnpm/action-setup@v4
|
||||
with:
|
||||
version: 10
|
||||
- name: Use Node.js 24
|
||||
uses: actions/setup-node@v6.2.0
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: 24.x
|
||||
cache: 'pnpm'
|
||||
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
|
||||
- name: Cache frontend dependencies
|
||||
uses: actions/cache@v5.0.3
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.pnpm-store
|
||||
@@ -163,19 +163,19 @@ jobs:
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6.0.2
|
||||
uses: actions/checkout@v6
|
||||
- name: Install pnpm
|
||||
uses: pnpm/action-setup@v4.2.0
|
||||
uses: pnpm/action-setup@v4
|
||||
with:
|
||||
version: 10
|
||||
- name: Use Node.js 24
|
||||
uses: actions/setup-node@v6.2.0
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: 24.x
|
||||
cache: 'pnpm'
|
||||
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
|
||||
- name: Cache frontend dependencies
|
||||
uses: actions/cache@v5.0.3
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.pnpm-store
|
||||
|
||||
26
.github/workflows/ci-release.yml
vendored
26
.github/workflows/ci-release.yml
vendored
@@ -28,14 +28,14 @@ jobs:
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6.0.2
|
||||
uses: actions/checkout@v6
|
||||
# ---- Frontend Build ----
|
||||
- name: Install pnpm
|
||||
uses: pnpm/action-setup@v4.2.0
|
||||
uses: pnpm/action-setup@v4
|
||||
with:
|
||||
version: 10
|
||||
- name: Use Node.js 24
|
||||
uses: actions/setup-node@v6.2.0
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: 24.x
|
||||
cache: 'pnpm'
|
||||
@@ -47,11 +47,11 @@ jobs:
|
||||
# ---- Backend Setup ----
|
||||
- name: Set up Python
|
||||
id: setup-python
|
||||
uses: actions/setup-python@v6.2.0
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7.3.1
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: ${{ env.DEFAULT_UV_VERSION }}
|
||||
enable-cache: true
|
||||
@@ -118,7 +118,7 @@ jobs:
|
||||
sudo chown -R 1000:1000 paperless-ngx/
|
||||
tar -cJf paperless-ngx.tar.xz paperless-ngx/
|
||||
- name: Upload release artifact
|
||||
uses: actions/upload-artifact@v7.0.0
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: release
|
||||
path: dist/paperless-ngx.tar.xz
|
||||
@@ -133,7 +133,7 @@ jobs:
|
||||
version: ${{ steps.get-version.outputs.version }}
|
||||
steps:
|
||||
- name: Download release artifact
|
||||
uses: actions/download-artifact@v8.0.0
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: release
|
||||
path: ./
|
||||
@@ -148,7 +148,7 @@ jobs:
|
||||
fi
|
||||
- name: Create release and changelog
|
||||
id: create-release
|
||||
uses: release-drafter/release-drafter@v6.2.0
|
||||
uses: release-drafter/release-drafter@v6
|
||||
with:
|
||||
name: Paperless-ngx ${{ steps.get-version.outputs.version }}
|
||||
tag: ${{ steps.get-version.outputs.version }}
|
||||
@@ -159,7 +159,7 @@ jobs:
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
- name: Upload release archive
|
||||
uses: shogo82148/actions-upload-release-asset@v1.9.2
|
||||
uses: shogo82148/actions-upload-release-asset@v1
|
||||
with:
|
||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
upload_url: ${{ steps.create-release.outputs.upload_url }}
|
||||
@@ -176,16 +176,16 @@ jobs:
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6.0.2
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
ref: main
|
||||
- name: Set up Python
|
||||
id: setup-python
|
||||
uses: actions/setup-python@v6.2.0
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7.3.1
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: ${{ env.DEFAULT_UV_VERSION }}
|
||||
enable-cache: true
|
||||
@@ -218,7 +218,7 @@ jobs:
|
||||
git commit -am "Changelog ${{ needs.publish-release.outputs.version }} - GHA"
|
||||
git push origin ${{ needs.publish-release.outputs.version }}-changelog
|
||||
- name: Create pull request
|
||||
uses: actions/github-script@v8.0.0
|
||||
uses: actions/github-script@v8
|
||||
with:
|
||||
script: |
|
||||
const { repo, owner } = context.repo;
|
||||
|
||||
6
.github/workflows/codeql-analysis.yml
vendored
6
.github/workflows/codeql-analysis.yml
vendored
@@ -34,10 +34,10 @@ jobs:
|
||||
# Learn more about CodeQL language support at https://git.io/codeql-language-support
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v6.0.2
|
||||
uses: actions/checkout@v6
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@v4.32.5
|
||||
uses: github/codeql-action/init@v4
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
# If you wish to specify custom queries, you can do so here or in a config file.
|
||||
@@ -45,4 +45,4 @@ jobs:
|
||||
# Prefix the list here with "+" to use these queries and those in the config file.
|
||||
# queries: ./path/to/local/query, your-org/your-repo/queries@main
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@v4.32.5
|
||||
uses: github/codeql-action/analyze@v4
|
||||
|
||||
4
.github/workflows/crowdin.yml
vendored
4
.github/workflows/crowdin.yml
vendored
@@ -13,11 +13,11 @@ jobs:
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6.0.2
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
token: ${{ secrets.PNGX_BOT_PAT }}
|
||||
- name: crowdin action
|
||||
uses: crowdin/github-action@v2.15.0
|
||||
uses: crowdin/github-action@v2
|
||||
with:
|
||||
upload_translations: false
|
||||
download_translations: true
|
||||
|
||||
8
.github/workflows/pr-bot.yml
vendored
8
.github/workflows/pr-bot.yml
vendored
@@ -12,7 +12,7 @@ jobs:
|
||||
steps:
|
||||
- name: Label PR by file path or branch name
|
||||
# see .github/labeler.yml for the labeler config
|
||||
uses: actions/labeler@v6.0.1
|
||||
uses: actions/labeler@v6
|
||||
with:
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
- name: Label by size
|
||||
@@ -26,7 +26,7 @@ jobs:
|
||||
fail_if_xl: 'false'
|
||||
excluded_files: /\.lock$/ /\.txt$/ ^src-ui/pnpm-lock\.yaml$ ^src-ui/messages\.xlf$ ^src/locale/en_US/LC_MESSAGES/django\.po$
|
||||
- name: Label by PR title
|
||||
uses: actions/github-script@v8.0.0
|
||||
uses: actions/github-script@v8
|
||||
with:
|
||||
script: |
|
||||
const pr = context.payload.pull_request;
|
||||
@@ -52,7 +52,7 @@ jobs:
|
||||
}
|
||||
- name: Label bot-generated PRs
|
||||
if: ${{ contains(github.actor, 'dependabot') || contains(github.actor, 'crowdin-bot') }}
|
||||
uses: actions/github-script@v8.0.0
|
||||
uses: actions/github-script@v8
|
||||
with:
|
||||
script: |
|
||||
const pr = context.payload.pull_request;
|
||||
@@ -77,7 +77,7 @@ jobs:
|
||||
}
|
||||
- name: Welcome comment
|
||||
if: ${{ !contains(github.actor, 'bot') }}
|
||||
uses: actions/github-script@v8.0.0
|
||||
uses: actions/github-script@v8
|
||||
with:
|
||||
script: |
|
||||
const pr = context.payload.pull_request;
|
||||
|
||||
2
.github/workflows/project-actions.yml
vendored
2
.github/workflows/project-actions.yml
vendored
@@ -19,6 +19,6 @@ jobs:
|
||||
if: github.event_name == 'pull_request_target' && (github.event.action == 'opened' || github.event.action == 'reopened') && github.event.pull_request.user.login != 'dependabot'
|
||||
steps:
|
||||
- name: Label PR with release-drafter
|
||||
uses: release-drafter/release-drafter@v6.2.0
|
||||
uses: release-drafter/release-drafter@v6
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
10
.github/workflows/repo-maintenance.yml
vendored
10
.github/workflows/repo-maintenance.yml
vendored
@@ -15,7 +15,7 @@ jobs:
|
||||
if: github.repository_owner == 'paperless-ngx'
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/stale@v10.2.0
|
||||
- uses: actions/stale@v10
|
||||
with:
|
||||
days-before-stale: 7
|
||||
days-before-close: 14
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
if: github.repository_owner == 'paperless-ngx'
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: dessant/lock-threads@v6.0.0
|
||||
- uses: dessant/lock-threads@v6
|
||||
with:
|
||||
issue-inactive-days: '30'
|
||||
pr-inactive-days: '30'
|
||||
@@ -57,7 +57,7 @@ jobs:
|
||||
if: github.repository_owner == 'paperless-ngx'
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/github-script@v8.0.0
|
||||
- uses: actions/github-script@v8
|
||||
with:
|
||||
script: |
|
||||
function sleep(ms) {
|
||||
@@ -114,7 +114,7 @@ jobs:
|
||||
if: github.repository_owner == 'paperless-ngx'
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/github-script@v8.0.0
|
||||
- uses: actions/github-script@v8
|
||||
with:
|
||||
script: |
|
||||
function sleep(ms) {
|
||||
@@ -206,7 +206,7 @@ jobs:
|
||||
if: github.repository_owner == 'paperless-ngx'
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/github-script@v8.0.0
|
||||
- uses: actions/github-script@v8
|
||||
with:
|
||||
script: |
|
||||
function sleep(ms) {
|
||||
|
||||
14
.github/workflows/translate-strings.yml
vendored
14
.github/workflows/translate-strings.yml
vendored
@@ -11,7 +11,7 @@ jobs:
|
||||
contents: write
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6.0.2
|
||||
uses: actions/checkout@v6
|
||||
env:
|
||||
GH_REF: ${{ github.ref }} # sonar rule:githubactions:S7630 - avoid injection
|
||||
with:
|
||||
@@ -19,13 +19,13 @@ jobs:
|
||||
ref: ${{ env.GH_REF }}
|
||||
- name: Set up Python
|
||||
id: setup-python
|
||||
uses: actions/setup-python@v6.2.0
|
||||
uses: actions/setup-python@v6
|
||||
- name: Install system dependencies
|
||||
run: |
|
||||
sudo apt-get update -qq
|
||||
sudo apt-get install -qq --no-install-recommends gettext
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7.3.1
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
enable-cache: true
|
||||
- name: Install backend python dependencies
|
||||
@@ -36,18 +36,18 @@ jobs:
|
||||
- name: Generate backend translation strings
|
||||
run: cd src/ && uv run manage.py makemessages -l en_US -i "samples*"
|
||||
- name: Install pnpm
|
||||
uses: pnpm/action-setup@v4.2.0
|
||||
uses: pnpm/action-setup@v4
|
||||
with:
|
||||
version: 10
|
||||
- name: Use Node.js 24
|
||||
uses: actions/setup-node@v6.2.0
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: 24.x
|
||||
cache: 'pnpm'
|
||||
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
|
||||
- name: Cache frontend dependencies
|
||||
id: cache-frontend-deps
|
||||
uses: actions/cache@v5.0.3
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.pnpm-store
|
||||
@@ -63,7 +63,7 @@ jobs:
|
||||
cd src-ui
|
||||
pnpm run ng extract-i18n
|
||||
- name: Commit changes
|
||||
uses: stefanzweifel/git-auto-commit-action@v7.1.0
|
||||
uses: stefanzweifel/git-auto-commit-action@v7
|
||||
with:
|
||||
file_pattern: 'src-ui/messages.xlf src/locale/en_US/LC_MESSAGES/django.po'
|
||||
commit_message: "Auto translate strings"
|
||||
|
||||
@@ -45,7 +45,7 @@ ENV \
|
||||
ARG TARGETARCH
|
||||
ARG TARGETVARIANT
|
||||
# Lock this version
|
||||
ARG S6_OVERLAY_VERSION=3.2.2.0
|
||||
ARG S6_OVERLAY_VERSION=3.2.1.0
|
||||
|
||||
ARG S6_BUILD_TIME_PKGS="curl \
|
||||
xz-utils"
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# correct networking for the tests
|
||||
services:
|
||||
gotenberg:
|
||||
image: docker.io/gotenberg/gotenberg:8.27
|
||||
image: docker.io/gotenberg/gotenberg:8.26
|
||||
hostname: gotenberg
|
||||
container_name: gotenberg
|
||||
network_mode: host
|
||||
|
||||
@@ -72,7 +72,7 @@ services:
|
||||
PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000
|
||||
PAPERLESS_TIKA_ENDPOINT: http://tika:9998
|
||||
gotenberg:
|
||||
image: docker.io/gotenberg/gotenberg:8.27
|
||||
image: docker.io/gotenberg/gotenberg:8.26
|
||||
restart: unless-stopped
|
||||
# The gotenberg chromium route is used to convert .eml files. We do not
|
||||
# want to allow external content like tracking pixels or even javascript.
|
||||
|
||||
@@ -66,7 +66,7 @@ services:
|
||||
PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000
|
||||
PAPERLESS_TIKA_ENDPOINT: http://tika:9998
|
||||
gotenberg:
|
||||
image: docker.io/gotenberg/gotenberg:8.27
|
||||
image: docker.io/gotenberg/gotenberg:8.26
|
||||
restart: unless-stopped
|
||||
# The gotenberg chromium route is used to convert .eml files. We do not
|
||||
# want to allow external content like tracking pixels or even javascript.
|
||||
|
||||
@@ -55,7 +55,7 @@ services:
|
||||
PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000
|
||||
PAPERLESS_TIKA_ENDPOINT: http://tika:9998
|
||||
gotenberg:
|
||||
image: docker.io/gotenberg/gotenberg:8.27
|
||||
image: docker.io/gotenberg/gotenberg:8.26
|
||||
restart: unless-stopped
|
||||
# The gotenberg chromium route is used to convert .eml files. We do not
|
||||
# want to allow external content like tracking pixels or even javascript.
|
||||
|
||||
@@ -262,10 +262,6 @@ your files differently, you can do that by adjusting the
|
||||
or using [storage paths (see below)](#storage-paths). Paperless adds the
|
||||
correct file extension e.g. `.pdf`, `.jpg` automatically.
|
||||
|
||||
When a document has file versions, each version uses the same naming rules and
|
||||
storage path resolution as any other document file, with an added version suffix
|
||||
such as `_v1`, `_v2`, etc.
|
||||
|
||||
This variable allows you to configure the filename (folders are allowed)
|
||||
using placeholders. For example, configuring this to
|
||||
|
||||
@@ -357,8 +353,6 @@ If paperless detects that two documents share the same filename,
|
||||
paperless will automatically append `_01`, `_02`, etc. to the filename.
|
||||
This happens if all the placeholders in a filename evaluate to the same
|
||||
value.
|
||||
For versioned files, this counter is appended after the version suffix
|
||||
(for example `statement_v2_01.pdf`).
|
||||
|
||||
If there are any errors in the placeholders included in `PAPERLESS_FILENAME_FORMAT`,
|
||||
paperless will fall back to using the default naming scheme instead.
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 57 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 61 KiB |
@@ -95,7 +95,6 @@ Think of versions as **file history** for a document.
|
||||
|
||||
- Versions track the underlying file and extracted text content (OCR/text).
|
||||
- Metadata such as tags, correspondent, document type, storage path and custom fields stay on the "root" document.
|
||||
- Version files follow normal filename formatting (including storage paths) and add a `_vN` suffix (for example `_v1`, `_v2`).
|
||||
- By default, search and document content use the latest version.
|
||||
- In document detail, selecting a version switches the preview, file metadata and content (as well as the download and related buttons) to that version.
|
||||
- Deleting a non-root version keeps metadata and falls back to the latest remaining version.
|
||||
@@ -617,7 +616,7 @@ applied. You can use the following placeholders in the template with any trigger
|
||||
- `{{added_day}}`: added day
|
||||
- `{{added_time}}`: added time in HH:MM format
|
||||
- `{{original_filename}}`: original file name without extension
|
||||
- `{{filename}}`: current file name without extension (for "added" workflows this may not be final yet; use `{{original_filename}}` instead)
|
||||
- `{{filename}}`: current file name without extension
|
||||
- `{{doc_title}}`: current document title (cannot be used in title assignment)
|
||||
|
||||
The following placeholders are only available for "added" or "updated" triggers
|
||||
|
||||
@@ -111,6 +111,7 @@ docs = [
|
||||
testing = [
|
||||
"daphne",
|
||||
"factory-boy~=3.3.1",
|
||||
"faker~=40.5.1",
|
||||
"imagehash",
|
||||
"pytest~=9.0.0",
|
||||
"pytest-cov~=7.0.0",
|
||||
|
||||
@@ -11,7 +11,6 @@ import magic
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import User
|
||||
from django.db import transaction
|
||||
from django.db.models import Max
|
||||
from django.db.models import Q
|
||||
from django.utils import timezone
|
||||
from filelock import FileLock
|
||||
@@ -124,6 +123,22 @@ class ConsumerPluginMixin:
|
||||
|
||||
self.filename = self.metadata.filename or self.input_doc.original_file.name
|
||||
|
||||
if input_doc.root_document_id:
|
||||
self.log.debug(
|
||||
f"Document root document id: {input_doc.root_document_id}",
|
||||
)
|
||||
root_document = Document.objects.get(pk=input_doc.root_document_id)
|
||||
version_index = Document.objects.filter(root_document=root_document).count()
|
||||
filename_path = Path(self.filename)
|
||||
if filename_path.suffix:
|
||||
self.filename = str(
|
||||
filename_path.with_name(
|
||||
f"{filename_path.stem}_v{version_index}{filename_path.suffix}",
|
||||
),
|
||||
)
|
||||
else:
|
||||
self.filename = f"{self.filename}_v{version_index}"
|
||||
|
||||
def _send_progress(
|
||||
self,
|
||||
current_progress: int,
|
||||
@@ -169,7 +184,7 @@ class ConsumerPlugin(
|
||||
):
|
||||
logging_name = LOGGING_NAME
|
||||
|
||||
def _create_version_from_root(
|
||||
def _clone_root_into_version(
|
||||
self,
|
||||
root_doc: Document,
|
||||
*,
|
||||
@@ -178,38 +193,30 @@ class ConsumerPlugin(
|
||||
mime_type: str,
|
||||
) -> Document:
|
||||
self.log.debug("Saving record for updated version to database")
|
||||
root_doc_frozen = Document.objects.select_for_update().get(pk=root_doc.pk)
|
||||
next_version_index = (
|
||||
Document.global_objects.filter(
|
||||
root_document_id=root_doc_frozen.pk,
|
||||
).aggregate(
|
||||
max_index=Max("version_index"),
|
||||
)["max_index"]
|
||||
or 0
|
||||
)
|
||||
version_doc = Document.objects.get(pk=root_doc.pk)
|
||||
setattr(version_doc, "pk", None)
|
||||
version_doc.root_document = root_doc
|
||||
file_for_checksum = (
|
||||
self.unmodified_original
|
||||
if self.unmodified_original is not None
|
||||
else self.working_copy
|
||||
)
|
||||
version_doc = Document(
|
||||
root_document=root_doc_frozen,
|
||||
version_index=next_version_index + 1,
|
||||
checksum=hashlib.md5(
|
||||
file_for_checksum.read_bytes(),
|
||||
).hexdigest(),
|
||||
content=text or "",
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
original_filename=self.filename,
|
||||
owner_id=root_doc_frozen.owner_id,
|
||||
created=root_doc_frozen.created,
|
||||
title=root_doc_frozen.title,
|
||||
added=timezone.now(),
|
||||
modified=timezone.now(),
|
||||
)
|
||||
version_doc.checksum = hashlib.md5(
|
||||
file_for_checksum.read_bytes(),
|
||||
).hexdigest()
|
||||
version_doc.content = text or ""
|
||||
version_doc.page_count = page_count
|
||||
version_doc.mime_type = mime_type
|
||||
version_doc.original_filename = self.filename
|
||||
version_doc.storage_path = root_doc.storage_path
|
||||
# Clear unique file path fields so they can be generated uniquely later
|
||||
version_doc.filename = None
|
||||
version_doc.archive_filename = None
|
||||
version_doc.archive_checksum = None
|
||||
if self.metadata.version_label is not None:
|
||||
version_doc.version_label = self.metadata.version_label
|
||||
version_doc.added = timezone.now()
|
||||
version_doc.modified = timezone.now()
|
||||
return version_doc
|
||||
|
||||
def run_pre_consume_script(self) -> None:
|
||||
@@ -535,7 +542,7 @@ class ConsumerPlugin(
|
||||
root_doc = Document.objects.get(
|
||||
pk=self.input_doc.root_document_id,
|
||||
)
|
||||
original_document = self._create_version_from_root(
|
||||
original_document = self._clone_root_into_version(
|
||||
root_doc,
|
||||
text=text,
|
||||
page_count=page_count,
|
||||
|
||||
@@ -128,18 +128,11 @@ def generate_filename(
|
||||
counter=0,
|
||||
archive_filename=False,
|
||||
) -> Path:
|
||||
# version docs use the root document for formatting, just with a suffix
|
||||
context_doc = doc if doc.root_document_id is None else doc.root_document
|
||||
version_suffix = (
|
||||
f"_v{doc.version_index}"
|
||||
if doc.root_document_id is not None and doc.version_index is not None
|
||||
else ""
|
||||
)
|
||||
base_path: Path | None = None
|
||||
|
||||
# Determine the source of the format string
|
||||
if context_doc.storage_path is not None:
|
||||
filename_format = context_doc.storage_path.path
|
||||
if doc.storage_path is not None:
|
||||
filename_format = doc.storage_path.path
|
||||
elif settings.FILENAME_FORMAT is not None:
|
||||
# Maybe convert old to new style
|
||||
filename_format = convert_format_str_to_template_format(
|
||||
@@ -150,7 +143,7 @@ def generate_filename(
|
||||
|
||||
# If we have one, render it
|
||||
if filename_format is not None:
|
||||
rendered_path: str | None = format_filename(context_doc, filename_format)
|
||||
rendered_path: str | None = format_filename(doc, filename_format)
|
||||
if rendered_path:
|
||||
base_path = Path(rendered_path)
|
||||
|
||||
@@ -164,7 +157,7 @@ def generate_filename(
|
||||
base_filename = base_path.name
|
||||
|
||||
# Build the final filename with counter and filetype
|
||||
final_filename = f"{base_filename}{version_suffix}{counter_str}{filetype_str}"
|
||||
final_filename = f"{base_filename}{counter_str}{filetype_str}"
|
||||
|
||||
# If we have a directory component, include it
|
||||
if str(directory) != ".":
|
||||
@@ -173,9 +166,7 @@ def generate_filename(
|
||||
full_path = Path(final_filename)
|
||||
else:
|
||||
# No template, use document ID
|
||||
final_filename = (
|
||||
f"{context_doc.pk:07}{version_suffix}{counter_str}{filetype_str}"
|
||||
)
|
||||
final_filename = f"{doc.pk:07}{counter_str}{filetype_str}"
|
||||
full_path = Path(final_filename)
|
||||
|
||||
return full_path
|
||||
|
||||
@@ -6,11 +6,14 @@ Provides automatic progress bar and multiprocessing support with minimal boilerp
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterable
|
||||
from collections.abc import Sized
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from concurrent.futures import as_completed
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Any
|
||||
@@ -22,7 +25,11 @@ from django import db
|
||||
from django.core.management import CommandError
|
||||
from django.db.models import QuerySet
|
||||
from django_rich.management import RichCommand
|
||||
from rich import box
|
||||
from rich.console import Console
|
||||
from rich.console import Group
|
||||
from rich.console import RenderableType
|
||||
from rich.live import Live
|
||||
from rich.progress import BarColumn
|
||||
from rich.progress import MofNCompleteColumn
|
||||
from rich.progress import Progress
|
||||
@@ -30,11 +37,11 @@ from rich.progress import SpinnerColumn
|
||||
from rich.progress import TextColumn
|
||||
from rich.progress import TimeElapsedColumn
|
||||
from rich.progress import TimeRemainingColumn
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Generator
|
||||
from collections.abc import Iterable
|
||||
from collections.abc import Sequence
|
||||
|
||||
from django.core.management import CommandParser
|
||||
@@ -43,6 +50,78 @@ T = TypeVar("T")
|
||||
R = TypeVar("R")
|
||||
|
||||
|
||||
@dataclass(slots=True, frozen=True)
|
||||
class _BufferedRecord:
|
||||
level: int
|
||||
name: str
|
||||
message: str
|
||||
|
||||
|
||||
class BufferingLogHandler(logging.Handler):
    """Captures log records during a command run for deferred rendering.

    Attach to a logger before a long operation and call ``render()``
    afterwards to emit the buffered records via Rich, optionally filtered
    by minimum level.

    Each record is stored as a ``_BufferedRecord`` (level, logger name and
    the message as formatted by this handler at emit time).
    """

    def __init__(self) -> None:
        super().__init__()
        self._records: list[_BufferedRecord] = []

    def emit(self, record: logging.LogRecord) -> None:
        """Format ``record`` and append it to the in-memory buffer.

        A failure while formatting (for example mismatched ``%`` arguments)
        is routed through ``handleError`` instead of being raised into the
        code that issued the log call, matching the stdlib handlers.
        """
        try:
            message = self.format(record)
        except Exception:
            self.handleError(record)
            return

        self._records.append(
            _BufferedRecord(
                level=record.levelno,
                name=record.name,
                message=message,
            ),
        )

    def render(
        self,
        console: Console,
        *,
        min_level: int = logging.DEBUG,
        title: str = "Log Output",
    ) -> None:
        """Print the buffered records as a Rich table.

        Args:
            console: Console the table is printed to.
            min_level: Records with a level below this value are skipped.
            title: Title shown above the table.

        Nothing is printed when no buffered record passes the filter. The
        buffer itself is left untouched; call ``clear()`` to empty it.
        """
        records = [r for r in self._records if r.level >= min_level]
        if not records:
            return

        table = Table(
            title=title,
            show_header=True,
            header_style="bold",
            show_lines=False,
            box=box.SIMPLE,
        )
        table.add_column("Level", style="bold", width=8)
        table.add_column("Logger", style="dim")
        table.add_column("Message", no_wrap=False)

        _level_styles: dict[int, str] = {
            logging.DEBUG: "dim",
            logging.INFO: "cyan",
            logging.WARNING: "yellow",
            logging.ERROR: "red",
            logging.CRITICAL: "bold red",
        }

        for record in records:
            # Non-standard levels fall back to no extra styling.
            style = _level_styles.get(record.level, "")
            table.add_row(
                Text(logging.getLevelName(record.level), style=style),
                # Wrap in Text so square brackets in logger names or log
                # messages are shown literally instead of being parsed as
                # Rich console markup.
                Text(record.name),
                Text(record.message),
            )

        console.print(table)

    def clear(self) -> None:
        """Discard all buffered records."""
        self._records.clear()
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class ProcessResult(Generic[T, R]):
|
||||
"""
|
||||
@@ -91,6 +170,23 @@ class PaperlessCommand(RichCommand):
|
||||
for result in self.process_parallel(process_doc, ids):
|
||||
if result.error:
|
||||
self.console.print(f"[red]Failed: {result.error}[/red]")
|
||||
|
||||
class Command(PaperlessCommand):
|
||||
help = "Import documents with live stats"
|
||||
|
||||
def handle(self, *args, **options):
|
||||
stats = ImportStats()
|
||||
|
||||
def render_stats() -> Table:
|
||||
... # build Rich Table from stats
|
||||
|
||||
for item in self.track_with_stats(
|
||||
items,
|
||||
description="Importing...",
|
||||
stats_renderer=render_stats,
|
||||
):
|
||||
result = import_item(item)
|
||||
stats.imported += 1
|
||||
"""
|
||||
|
||||
supports_progress_bar: ClassVar[bool] = True
|
||||
@@ -128,13 +224,11 @@ class PaperlessCommand(RichCommand):
|
||||
This is called by Django's command infrastructure after argument parsing
|
||||
but before handle(). We use it to set instance attributes from options.
|
||||
"""
|
||||
# Set progress bar state
|
||||
if self.supports_progress_bar:
|
||||
self.no_progress_bar = options.get("no_progress_bar", False)
|
||||
else:
|
||||
self.no_progress_bar = True
|
||||
|
||||
# Set multiprocessing state
|
||||
if self.supports_multiprocessing:
|
||||
self.process_count = options.get("processes", 1)
|
||||
if self.process_count < 1:
|
||||
@@ -144,9 +238,69 @@ class PaperlessCommand(RichCommand):
|
||||
|
||||
return super().execute(*args, **options)
|
||||
|
||||
@contextmanager
def buffered_logging(
    self,
    *logger_names: str,
    level: int = logging.DEBUG,
) -> Generator[BufferingLogHandler, None, None]:
    """Context manager that captures log output from named loggers.

    Installs a ``BufferingLogHandler`` on each named logger for the
    duration of the block, suppressing propagation to avoid interleaving
    with the Rich live display. The handler is removed and each logger's
    previous ``propagate`` value is restored on exit regardless of
    whether an exception occurred.

    Args:
        *logger_names: Names of the loggers to capture. A name given more
            than once is only patched once.
        level: Minimum level the handler buffers; records below it are
            dropped by the handler.

    Yields:
        The handler holding the captured records.

    Usage::

        with self.buffered_logging("paperless", "documents") as log_buf:
            # ... run progress loop ...
            if options["verbose"]:
                log_buf.render(self.console)
    """
    handler = BufferingLogHandler()
    handler.setLevel(level)
    handler.setFormatter(logging.Formatter("%(message)s"))

    # (logger, propagate value before patching) for every logger touched.
    patched: list[tuple[logging.Logger, bool]] = []

    try:
        # dict.fromkeys de-duplicates while keeping order, so a repeated
        # name cannot overwrite the saved propagate flag with False.
        for name in dict.fromkeys(logger_names):
            log = logging.getLogger(name)
            patched.append((log, log.propagate))
            log.addHandler(handler)
            log.propagate = False

        yield handler
    finally:
        # Also runs if patching itself failed part-way through.
        for log, propagate in patched:
            log.removeHandler(handler)
            log.propagate = propagate
|
||||
|
||||
@staticmethod
def _progress_columns() -> tuple[Any, ...]:
    """
    Build the column set shared by every progress bar of this command.

    Kept in one place so that _create_progress (standalone) and
    track_with_stats (inside Live) always show the same columns.

    Returns:
        Fresh column instances, in display order.
    """
    columns: list[Any] = [
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
    ]
    columns += [BarColumn(), MofNCompleteColumn()]
    columns += [TimeElapsedColumn(), TimeRemainingColumn()]
    return tuple(columns)
|
||||
|
||||
def _create_progress(self, description: str) -> Progress:
|
||||
"""
|
||||
Create a configured Progress instance.
|
||||
Create a standalone Progress instance with its own stderr Console.
|
||||
|
||||
Use this for track(). For track_with_stats(), Progress is created
|
||||
directly inside a Live context instead.
|
||||
|
||||
Progress output is directed to stderr to match the convention that
|
||||
progress bars are transient UI feedback, not command output. This
|
||||
@@ -161,12 +315,7 @@ class PaperlessCommand(RichCommand):
|
||||
A Progress instance configured with appropriate columns.
|
||||
"""
|
||||
return Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
MofNCompleteColumn(),
|
||||
TimeElapsedColumn(),
|
||||
TimeRemainingColumn(),
|
||||
*self._progress_columns(),
|
||||
console=Console(stderr=True),
|
||||
transient=False,
|
||||
)
|
||||
@@ -222,7 +371,6 @@ class PaperlessCommand(RichCommand):
|
||||
yield from iterable
|
||||
return
|
||||
|
||||
# Attempt to determine total if not provided
|
||||
if total is None:
|
||||
total = self._get_iterable_length(iterable)
|
||||
|
||||
@@ -232,6 +380,87 @@ class PaperlessCommand(RichCommand):
|
||||
yield item
|
||||
progress.advance(task_id)
|
||||
|
||||
def track_with_stats(
    self,
    iterable: Iterable[T],
    *,
    description: str = "Processing...",
    stats_renderer: Callable[[], RenderableType],
    total: int | None = None,
) -> Generator[T, None, None]:
    """
    Yield items while showing a progress bar plus a live stats renderable.

    The bar and whatever ``stats_renderer`` returns are grouped inside one
    Live display on stderr; the group is rebuilt after every item so the
    stats appear underneath the bar and update in place.

    When ``self.no_progress_bar`` is set, items are passed through with no
    display at all and ``stats_renderer`` is never called. The caller's
    loop body still runs, so any counters it maintains stay correct for a
    post-loop summary.

    Args:
        iterable: The items to iterate over.
        description: Text shown next to the progress bar.
        stats_renderer: Zero-argument callable returning a Rich
            renderable. Called once up front and again after each item.
        total: Number of items. When None it is obtained from
            ``self._get_iterable_length(iterable)``.

    Yields:
        Items from the iterable, unchanged and in order.

    Example:
        stats = Stats()  # any mutable object with counters

        def render_stats() -> Table:
            table = Table(box=None)
            table.add_column("Processed")
            table.add_column("Failed")
            table.add_row(str(stats.processed), str(stats.failed))
            return table

        for item in self.track_with_stats(
            items,
            description="Importing...",
            stats_renderer=render_stats,
        ):
            try:
                import_item(item)
                stats.processed += 1
            except Exception:
                stats.failed += 1
    """
    if self.no_progress_bar:
        yield from iterable
        return

    item_count = self._get_iterable_length(iterable) if total is None else total

    stderr_console = Console(stderr=True)

    # The bar is never started on its own; Live renders it as part of the group.
    bar = Progress(*self._progress_columns())
    task = bar.add_task(description, total=item_count)

    def _frame() -> Group:
        # Re-query the stats on every frame so the display reflects the latest counters.
        return Group(bar, stats_renderer())

    with Live(_frame(), console=stderr_console, refresh_per_second=4) as display:
        for element in iterable:
            yield element
            bar.advance(task)
            display.update(_frame())
|
||||
|
||||
def process_parallel(
|
||||
self,
|
||||
fn: Callable[[T], R],
|
||||
@@ -269,7 +498,7 @@ class PaperlessCommand(RichCommand):
|
||||
total = len(items)
|
||||
|
||||
if self.process_count == 1:
|
||||
# Sequential execution in main process - critical for testing
|
||||
# Sequential execution in main process - critical for testing, so we don't fork in fork, etc
|
||||
yield from self._process_sequential(fn, items, description, total)
|
||||
else:
|
||||
# Parallel execution with ProcessPoolExecutor
|
||||
@@ -298,6 +527,7 @@ class PaperlessCommand(RichCommand):
|
||||
total: int,
|
||||
) -> Generator[ProcessResult[T, R], None, None]:
|
||||
"""Process items in parallel using ProcessPoolExecutor."""
|
||||
|
||||
# Close database connections before forking - required for PostgreSQL
|
||||
db.connections.close_all()
|
||||
|
||||
|
||||
@@ -1,25 +1,22 @@
|
||||
from django.core.management import BaseCommand
|
||||
from django.db import transaction
|
||||
|
||||
from documents.management.commands.base import PaperlessCommand
|
||||
from documents.management.commands.mixins import ProgressBarMixin
|
||||
from documents.tasks import index_optimize
|
||||
from documents.tasks import index_reindex
|
||||
|
||||
|
||||
class Command(PaperlessCommand):
|
||||
class Command(ProgressBarMixin, BaseCommand):
|
||||
help = "Manages the document index."
|
||||
|
||||
def add_arguments(self, parser):
|
||||
super().add_arguments(parser)
|
||||
parser.add_argument("command", choices=["reindex", "optimize"])
|
||||
self.add_argument_progress_bar_mixin(parser)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
self.handle_progress_bar_mixin(**options)
|
||||
with transaction.atomic():
|
||||
if options["command"] == "reindex":
|
||||
index_reindex(
|
||||
iter_wrapper=lambda docs: self.track(
|
||||
docs,
|
||||
description="Indexing documents...",
|
||||
),
|
||||
)
|
||||
index_reindex(progress_bar_disable=self.no_progress_bar)
|
||||
elif options["command"] == "optimize":
|
||||
index_optimize()
|
||||
|
||||
@@ -1,22 +1,22 @@
|
||||
from typing import Any
|
||||
from django.core.management import BaseCommand
|
||||
from django.db import transaction
|
||||
|
||||
from documents.management.commands.base import PaperlessCommand
|
||||
from documents.management.commands.mixins import ProgressBarMixin
|
||||
from documents.tasks import llmindex_index
|
||||
|
||||
|
||||
class Command(PaperlessCommand):
|
||||
class Command(ProgressBarMixin, BaseCommand):
|
||||
help = "Manages the LLM-based vector index for Paperless."
|
||||
|
||||
def add_arguments(self, parser: Any) -> None:
|
||||
super().add_arguments(parser)
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument("command", choices=["rebuild", "update"])
|
||||
self.add_argument_progress_bar_mixin(parser)
|
||||
|
||||
def handle(self, *args: Any, **options: Any) -> None:
|
||||
llmindex_index(
|
||||
rebuild=options["command"] == "rebuild",
|
||||
scheduled=False,
|
||||
iter_wrapper=lambda docs: self.track(
|
||||
docs,
|
||||
description="Indexing documents...",
|
||||
),
|
||||
)
|
||||
def handle(self, *args, **options):
|
||||
self.handle_progress_bar_mixin(**options)
|
||||
with transaction.atomic():
|
||||
llmindex_index(
|
||||
progress_bar_disable=self.no_progress_bar,
|
||||
rebuild=options["command"] == "rebuild",
|
||||
scheduled=False,
|
||||
)
|
||||
|
||||
@@ -1,4 +1,12 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import field
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
|
||||
from documents.classifier import load_classifier
|
||||
from documents.management.commands.base import PaperlessCommand
|
||||
@@ -8,9 +16,162 @@ from documents.signals.handlers import set_document_type
|
||||
from documents.signals.handlers import set_storage_path
|
||||
from documents.signals.handlers import set_tags
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from rich.console import RenderableType
|
||||
|
||||
from documents.models import Correspondent
|
||||
from documents.models import DocumentType
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
|
||||
logger = logging.getLogger("paperless.management.retagger")
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RetaggerStats:
|
||||
"""Cumulative counters updated as the retagger processes documents.
|
||||
|
||||
Mutable by design -- fields are incremented in the processing loop.
|
||||
slots=True reduces per-instance memory overhead and speeds attribute access.
|
||||
"""
|
||||
|
||||
correspondents: int = 0
|
||||
document_types: int = 0
|
||||
tags_added: int = 0
|
||||
tags_removed: int = 0
|
||||
storage_paths: int = 0
|
||||
documents_processed: int = 0
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class DocumentSuggestion:
|
||||
"""Buffered classifier suggestions for a single document (suggest mode only).
|
||||
|
||||
Mutable by design -- fields are assigned incrementally as each setter runs.
|
||||
"""
|
||||
|
||||
document: Document
|
||||
correspondent: Correspondent | None = None
|
||||
document_type: DocumentType | None = None
|
||||
tags_to_add: frozenset[Tag] = field(default_factory=frozenset)
|
||||
tags_to_remove: frozenset[Tag] = field(default_factory=frozenset)
|
||||
storage_path: StoragePath | None = None
|
||||
|
||||
@property
|
||||
def has_suggestions(self) -> bool:
|
||||
return bool(
|
||||
self.correspondent is not None
|
||||
or self.document_type is not None
|
||||
or self.tags_to_add
|
||||
or self.tags_to_remove
|
||||
or self.storage_path is not None,
|
||||
)
|
||||
|
||||
|
||||
def _build_stats_table(stats: RetaggerStats, *, suggest: bool) -> Table:
    """
    Render the current counters as the one-row table shown under the progress bar.

    With ``suggest`` set, the correspondent / type / storage path cells say
    "would set" rather than "set", signalling that nothing was written to
    the database.
    """
    verb = "would set" if suggest else "set"

    table = Table(box=None, padding=(0, 2), show_header=True, header_style="bold")

    headings = (
        "Documents",
        "Correspondents",
        "Doc Types",
        "Tags (+)",
        "Tags (-)",
        "Storage Paths",
    )
    for heading in headings:
        table.add_column(heading)

    # One cell per heading, in the same order.
    cells = [
        str(stats.documents_processed),
        f"{stats.correspondents} {verb}",
        f"{stats.document_types} {verb}",
        f"+{stats.tags_added}",
        f"-{stats.tags_removed}",
        f"{stats.storage_paths} {verb}",
    ]
    table.add_row(*cells)

    return table
|
||||
|
||||
|
||||
def _build_suggestion_table(
    suggestions: list[DocumentSuggestion],
    base_url: str | None,
) -> Table:
    """
    Build the final suggestion table printed after the progress bar completes.

    Only documents with at least one suggestion are included.

    Args:
        suggestions: Per-document suggestions collected during the run.
        base_url: When given, a link of the form
            ``<base_url>/documents/<pk>`` is shown under the document
            name; otherwise the primary key is appended in brackets.

    Returns:
        A table with one row per document that has suggestions.

    All cell contents are passed as ``Text`` objects so that document,
    correspondent, type, tag and storage path names containing square
    brackets are displayed literally rather than parsed as Rich markup.
    """
    table = Table(
        title="Suggested Changes",
        show_header=True,
        header_style="bold cyan",
        show_lines=True,
    )

    table.add_column("Document", style="bold", no_wrap=False, min_width=20)
    table.add_column("Correspondent")
    table.add_column("Doc Type")
    table.add_column("Tags")
    table.add_column("Storage Path")

    # Avoid "//documents/..." when the caller passes a URL ending in "/".
    link_root = base_url.rstrip("/") if base_url else None

    def _name_cell(value: object) -> Text:
        # "-" marks "no suggestion" for this column.
        return Text(str(value)) if value else Text("-")

    for suggestion in suggestions:
        if not suggestion.has_suggestions:
            continue

        doc = suggestion.document

        if link_root:
            doc_cell = Text()
            doc_cell.append(str(doc))
            doc_cell.append(f"\n{link_root}/documents/{doc.pk}", style="dim")
        else:
            doc_cell = Text(f"{doc} [{doc.pk}]")

        # Additions first (green), then removals (red), each sorted by name.
        tag_labels: list[tuple[str, str]] = [
            (f"+{tag.name}", "green")
            for tag in sorted(suggestion.tags_to_add, key=lambda t: t.name)
        ]
        tag_labels.extend(
            (f"-{tag.name}", "red")
            for tag in sorted(suggestion.tags_to_remove, key=lambda t: t.name)
        )

        if tag_labels:
            tag_cell = Text()
            for index, (label, style) in enumerate(tag_labels):
                if index:
                    tag_cell.append(", ")
                # Styled via append() rather than markup so the tag name
                # itself is never interpreted.
                tag_cell.append(label, style=style)
        else:
            tag_cell = Text("-")

        table.add_row(
            doc_cell,
            _name_cell(suggestion.correspondent),
            _name_cell(suggestion.document_type),
            tag_cell,
            _name_cell(suggestion.storage_path),
        )

    return table
|
||||
|
||||
|
||||
def _build_summary_table(stats: RetaggerStats) -> Table:
    """Render the end-of-run totals as a two-column metric/count table."""
    table = Table(
        title="Retagger Summary",
        show_header=True,
        header_style="bold cyan",
    )

    table.add_column("Metric", style="bold")
    table.add_column("Count", justify="right")

    # (label, value) pairs in the order they are displayed.
    rows = (
        ("Documents processed", stats.documents_processed),
        ("Correspondents set", stats.correspondents),
        ("Document types set", stats.document_types),
        ("Tags added", stats.tags_added),
        ("Tags removed", stats.tags_removed),
        ("Storage paths set", stats.storage_paths),
    )
    for label, value in rows:
        table.add_row(label, str(value))

    return table
|
||||
|
||||
|
||||
class Command(PaperlessCommand):
|
||||
help = (
|
||||
"Using the current classification model, assigns correspondents, tags "
|
||||
@@ -19,7 +180,7 @@ class Command(PaperlessCommand):
|
||||
"modified) after their initial import."
|
||||
)
|
||||
|
||||
def add_arguments(self, parser):
|
||||
def add_arguments(self, parser) -> None:
|
||||
super().add_arguments(parser)
|
||||
parser.add_argument("-c", "--correspondent", default=False, action="store_true")
|
||||
parser.add_argument("-T", "--tags", default=False, action="store_true")
|
||||
@@ -31,9 +192,9 @@ class Command(PaperlessCommand):
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"By default this command won't try to assign a correspondent "
|
||||
"if more than one matches the document. Use this flag if "
|
||||
"you'd rather it just pick the first one it finds."
|
||||
"By default this command will not try to assign a correspondent "
|
||||
"if more than one matches the document. Use this flag to pick "
|
||||
"the first match instead."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -42,91 +203,140 @@ class Command(PaperlessCommand):
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"If set, the document retagger will overwrite any previously "
|
||||
"set correspondent, document and remove correspondents, types "
|
||||
"and tags that do not match anymore due to changed rules."
|
||||
"Overwrite any previously set correspondent, document type, and "
|
||||
"remove tags that no longer match due to changed rules."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--suggest",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Return the suggestion, don't change anything.",
|
||||
help="Show what would be changed without applying anything.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base-url",
|
||||
help="The base URL to use to build the link to the documents.",
|
||||
help="Base URL used to build document links in suggest output.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--id-range",
|
||||
help="A range of document ids on which the retagging should be applied.",
|
||||
help="Restrict retagging to documents within this ID range (inclusive).",
|
||||
nargs=2,
|
||||
type=int,
|
||||
)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
def handle(self, *args, **options) -> None:
|
||||
suggest: bool = options["suggest"]
|
||||
overwrite: bool = options["overwrite"]
|
||||
use_first: bool = options["use_first"]
|
||||
base_url: str | None = options["base_url"]
|
||||
|
||||
do_correspondent: bool = options["correspondent"]
|
||||
do_document_type: bool = options["document_type"]
|
||||
do_tags: bool = options["tags"]
|
||||
do_storage_path: bool = options["storage_path"]
|
||||
|
||||
if not any([do_correspondent, do_document_type, do_tags, do_storage_path]):
|
||||
self.console.print(
|
||||
"[yellow]No classifier targets specified. "
|
||||
"Use -c, -T, -t, or -s to select what to retag.[/yellow]",
|
||||
)
|
||||
return
|
||||
|
||||
if options["inbox_only"]:
|
||||
queryset = Document.objects.filter(tags__is_inbox_tag=True)
|
||||
else:
|
||||
queryset = Document.objects.all()
|
||||
|
||||
if options["id_range"]:
|
||||
queryset = queryset.filter(
|
||||
id__range=(options["id_range"][0], options["id_range"][1]),
|
||||
)
|
||||
lo, hi = options["id_range"]
|
||||
queryset = queryset.filter(id__range=(lo, hi))
|
||||
|
||||
documents = queryset.distinct()
|
||||
|
||||
classifier = load_classifier()
|
||||
|
||||
for document in self.track(documents, description="Retagging..."):
|
||||
if options["correspondent"]:
|
||||
set_correspondent(
|
||||
sender=None,
|
||||
document=document,
|
||||
classifier=classifier,
|
||||
replace=options["overwrite"],
|
||||
use_first=options["use_first"],
|
||||
suggest=options["suggest"],
|
||||
base_url=options["base_url"],
|
||||
stdout=self.stdout,
|
||||
style_func=self.style,
|
||||
)
|
||||
stats = RetaggerStats()
|
||||
suggestions: list[DocumentSuggestion] = []
|
||||
|
||||
if options["document_type"]:
|
||||
set_document_type(
|
||||
sender=None,
|
||||
document=document,
|
||||
classifier=classifier,
|
||||
replace=options["overwrite"],
|
||||
use_first=options["use_first"],
|
||||
suggest=options["suggest"],
|
||||
base_url=options["base_url"],
|
||||
stdout=self.stdout,
|
||||
style_func=self.style,
|
||||
)
|
||||
def render_stats() -> RenderableType:
|
||||
return _build_stats_table(stats, suggest=suggest)
|
||||
|
||||
if options["tags"]:
|
||||
set_tags(
|
||||
sender=None,
|
||||
document=document,
|
||||
classifier=classifier,
|
||||
replace=options["overwrite"],
|
||||
suggest=options["suggest"],
|
||||
base_url=options["base_url"],
|
||||
stdout=self.stdout,
|
||||
style_func=self.style,
|
||||
)
|
||||
with self.buffered_logging(
|
||||
"paperless",
|
||||
"paperless.handlers",
|
||||
"documents",
|
||||
) as log_buf:
|
||||
for document in self.track_with_stats(
|
||||
documents,
|
||||
description="Retagging...",
|
||||
stats_renderer=render_stats,
|
||||
):
|
||||
suggestion = DocumentSuggestion(document=document)
|
||||
|
||||
if options["storage_path"]:
|
||||
set_storage_path(
|
||||
sender=None,
|
||||
document=document,
|
||||
classifier=classifier,
|
||||
replace=options["overwrite"],
|
||||
use_first=options["use_first"],
|
||||
suggest=options["suggest"],
|
||||
base_url=options["base_url"],
|
||||
stdout=self.stdout,
|
||||
style_func=self.style,
|
||||
)
|
||||
if do_correspondent:
|
||||
correspondent = set_correspondent(
|
||||
None,
|
||||
document,
|
||||
classifier=classifier,
|
||||
replace=overwrite,
|
||||
use_first=use_first,
|
||||
dry_run=suggest,
|
||||
)
|
||||
if correspondent is not None:
|
||||
stats.correspondents += 1
|
||||
suggestion.correspondent = correspondent
|
||||
|
||||
if do_document_type:
|
||||
document_type = set_document_type(
|
||||
None,
|
||||
document,
|
||||
classifier=classifier,
|
||||
replace=overwrite,
|
||||
use_first=use_first,
|
||||
dry_run=suggest,
|
||||
)
|
||||
if document_type is not None:
|
||||
stats.document_types += 1
|
||||
suggestion.document_type = document_type
|
||||
|
||||
if do_tags:
|
||||
tags_to_add, tags_to_remove = set_tags(
|
||||
None,
|
||||
document,
|
||||
classifier=classifier,
|
||||
replace=overwrite,
|
||||
dry_run=suggest,
|
||||
)
|
||||
stats.tags_added += len(tags_to_add)
|
||||
stats.tags_removed += len(tags_to_remove)
|
||||
suggestion.tags_to_add = frozenset(tags_to_add)
|
||||
suggestion.tags_to_remove = frozenset(tags_to_remove)
|
||||
|
||||
if do_storage_path:
|
||||
storage_path = set_storage_path(
|
||||
None,
|
||||
document,
|
||||
classifier=classifier,
|
||||
replace=overwrite,
|
||||
use_first=use_first,
|
||||
dry_run=suggest,
|
||||
)
|
||||
if storage_path is not None:
|
||||
stats.storage_paths += 1
|
||||
suggestion.storage_path = storage_path
|
||||
|
||||
stats.documents_processed += 1
|
||||
|
||||
if suggest:
|
||||
suggestions.append(suggestion)
|
||||
|
||||
# Post-loop output
|
||||
if suggest:
|
||||
visible = [s for s in suggestions if s.has_suggestions]
|
||||
if visible:
|
||||
self.console.print(_build_suggestion_table(visible, base_url))
|
||||
else:
|
||||
self.console.print("[green]No changes suggested.[/green]")
|
||||
else:
|
||||
self.console.print(_build_summary_table(stats))
|
||||
|
||||
log_buf.render(self.console, min_level=logging.INFO, title="Retagger Log")
|
||||
|
||||
@@ -1,117 +1,17 @@
|
||||
"""Management command to check the document archive for issues."""
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
|
||||
from documents.management.commands.base import PaperlessCommand
|
||||
from documents.models import Document
|
||||
from documents.sanity_checker import SanityCheckMessages
|
||||
from documents.management.commands.mixins import ProgressBarMixin
|
||||
from documents.sanity_checker import check_sanity
|
||||
|
||||
_LEVEL_STYLE: dict[int, tuple[str, str]] = {
|
||||
logging.ERROR: ("bold red", "ERROR"),
|
||||
logging.WARNING: ("yellow", "WARN"),
|
||||
logging.INFO: ("dim", "INFO"),
|
||||
}
|
||||
|
||||
|
||||
class Command(PaperlessCommand):
|
||||
class Command(ProgressBarMixin, BaseCommand):
|
||||
help = "This command checks your document archive for issues."
|
||||
|
||||
def _render_results(self, messages: SanityCheckMessages) -> None:
|
||||
"""Render sanity check results as a Rich table."""
|
||||
def add_arguments(self, parser):
|
||||
self.add_argument_progress_bar_mixin(parser)
|
||||
|
||||
if (
|
||||
not messages.has_error
|
||||
and not messages.has_warning
|
||||
and not messages.has_info
|
||||
):
|
||||
self.console.print(
|
||||
Panel(
|
||||
"[green]No issues detected.[/green]",
|
||||
title="Sanity Check",
|
||||
border_style="green",
|
||||
),
|
||||
)
|
||||
return
|
||||
def handle(self, *args, **options):
|
||||
self.handle_progress_bar_mixin(**options)
|
||||
messages = check_sanity(progress=self.use_progress_bar, scheduled=False)
|
||||
|
||||
# Build a lookup for document titles
|
||||
doc_pks = [pk for pk in messages.document_pks() if pk is not None]
|
||||
titles: dict[int, str] = {}
|
||||
if doc_pks:
|
||||
titles = dict(
|
||||
Document.global_objects.filter(pk__in=doc_pks)
|
||||
.only("pk", "title")
|
||||
.values_list("pk", "title"),
|
||||
)
|
||||
|
||||
table = Table(
|
||||
title="Sanity Check Results",
|
||||
show_lines=True,
|
||||
title_style="bold",
|
||||
)
|
||||
table.add_column("Level", width=7, no_wrap=True)
|
||||
table.add_column("Document", min_width=20)
|
||||
table.add_column("Issue", ratio=1)
|
||||
|
||||
for doc_pk, doc_messages in messages.iter_messages():
|
||||
if doc_pk is not None:
|
||||
title = titles.get(doc_pk, "Unknown")
|
||||
doc_label = f"#{doc_pk} {title}"
|
||||
else:
|
||||
doc_label = "(global)"
|
||||
|
||||
for msg in doc_messages:
|
||||
style, label = _LEVEL_STYLE.get(
|
||||
msg["level"],
|
||||
("dim", "INFO"),
|
||||
)
|
||||
table.add_row(
|
||||
Text(label, style=style),
|
||||
Text(doc_label),
|
||||
Text(str(msg["message"])),
|
||||
)
|
||||
|
||||
self.console.print(table)
|
||||
|
||||
parts: list[str] = []
|
||||
|
||||
if messages.document_error_count:
|
||||
parts.append(
|
||||
f"{messages.document_error_count} document(s) with [bold red]errors[/bold red]",
|
||||
)
|
||||
if messages.document_warning_count:
|
||||
parts.append(
|
||||
f"{messages.document_warning_count} document(s) with [yellow]warnings[/yellow]",
|
||||
)
|
||||
if messages.document_info_count:
|
||||
parts.append(f"{messages.document_info_count} document(s) with infos")
|
||||
if messages.global_warning_count:
|
||||
parts.append(
|
||||
f"{messages.global_warning_count} global [yellow]warning(s)[/yellow]",
|
||||
)
|
||||
|
||||
if parts:
|
||||
if len(parts) > 1:
|
||||
summary = ", ".join(parts[:-1]) + " and " + parts[-1]
|
||||
else:
|
||||
summary = parts[0]
|
||||
self.console.print(f"\nFound {summary}.")
|
||||
else:
|
||||
self.console.print("\nNo issues found.")
|
||||
|
||||
def handle(self, *args: Any, **options: Any) -> None:
|
||||
messages = check_sanity(
|
||||
scheduled=False,
|
||||
iter_wrapper=lambda docs: self.track(
|
||||
docs,
|
||||
description="Checking documents...",
|
||||
),
|
||||
)
|
||||
self._render_results(messages)
|
||||
messages.log_messages()
|
||||
|
||||
@@ -1,37 +0,0 @@
|
||||
# Generated by Django 5.2.11 on 2026-03-02 17:48
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add ``version_index`` to ``document`` plus a conditional uniqueness constraint on it."""

    # Runs after documents.0013 and after the app providing the configured user model.
    dependencies = [
        ("documents", "0013_document_root_document"),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        # Optional (nullable, blank-able), indexed, non-negative integer column.
        migrations.AddField(
            model_name="document",
            name="version_index",
            field=models.PositiveIntegerField(
                blank=True,
                db_index=True,
                help_text="Index of this version within the root document.",
                null=True,
                verbose_name="version index",
            ),
        ),
        # (root_document, version_index) must be unique, but only for rows
        # where both values are set.
        migrations.AddConstraint(
            model_name="document",
            constraint=models.UniqueConstraint(
                condition=models.Q(
                    ("root_document__isnull", False),
                    ("version_index__isnull", False),
                ),
                fields=("root_document", "version_index"),
                name="documents_document_root_version_index_uniq",
            ),
        ),
    ]
|
||||
@@ -75,7 +75,7 @@ class MatchingModel(ModelWithOwner):
|
||||
|
||||
is_insensitive = models.BooleanField(_("is insensitive"), default=True)
|
||||
|
||||
class Meta:
|
||||
class Meta(ModelWithOwner.Meta):
|
||||
abstract = True
|
||||
ordering = ("name",)
|
||||
constraints = [
|
||||
@@ -317,14 +317,6 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
|
||||
verbose_name=_("root document for this version"),
|
||||
)
|
||||
|
||||
version_index = models.PositiveIntegerField(
|
||||
_("version index"),
|
||||
blank=True,
|
||||
null=True,
|
||||
db_index=True,
|
||||
help_text=_("Index of this version within the root document."),
|
||||
)
|
||||
|
||||
version_label = models.CharField(
|
||||
_("version label"),
|
||||
max_length=64,
|
||||
@@ -337,16 +329,6 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
|
||||
ordering = ("-created",)
|
||||
verbose_name = _("document")
|
||||
verbose_name_plural = _("documents")
|
||||
constraints = [
|
||||
models.UniqueConstraint(
|
||||
fields=["root_document", "version_index"],
|
||||
condition=models.Q(
|
||||
root_document__isnull=False,
|
||||
version_index__isnull=False,
|
||||
),
|
||||
name="documents_document_root_version_index_uniq",
|
||||
),
|
||||
]
|
||||
|
||||
def __str__(self) -> str:
|
||||
created = self.created.isoformat()
|
||||
|
||||
@@ -1,174 +1,80 @@
|
||||
"""
|
||||
Sanity checker for the Paperless-ngx document archive.
|
||||
|
||||
Verifies that all documents have valid files, correct checksums,
|
||||
and consistent metadata. Reports orphaned files in the media directory.
|
||||
|
||||
Progress display is the caller's responsibility -- pass an ``iter_wrapper``
|
||||
to wrap the document queryset (e.g., with a progress bar). The default
|
||||
is an identity function that adds no overhead.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterable
|
||||
from collections.abc import Iterator
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Final
|
||||
from typing import TypedDict
|
||||
from typing import TypeVar
|
||||
|
||||
from celery import states
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from tqdm import tqdm
|
||||
|
||||
from documents.models import Document
|
||||
from documents.models import PaperlessTask
|
||||
from paperless.config import GeneralConfig
|
||||
|
||||
logger = logging.getLogger("paperless.sanity_checker")
|
||||
|
||||
_T = TypeVar("_T")
|
||||
IterWrapper = Callable[[Iterable[_T]], Iterable[_T]]
|
||||
|
||||
|
||||
class MessageEntry(TypedDict):
|
||||
"""A single sanity check message with its severity level."""
|
||||
|
||||
level: int
|
||||
message: str
|
||||
|
||||
|
||||
def _identity(iterable: Iterable[_T]) -> Iterable[_T]:
|
||||
"""Pass through an iterable unchanged (default iter_wrapper)."""
|
||||
return iterable
|
||||
|
||||
|
||||
class SanityCheckMessages:
|
||||
"""Collects sanity check messages grouped by document primary key.
|
||||
|
||||
Messages are categorized as error, warning, or info. ``None`` is used
|
||||
as the key for messages not associated with a specific document
|
||||
(e.g., orphaned files).
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._messages: dict[int | None, list[MessageEntry]] = defaultdict(list)
|
||||
self.has_error: bool = False
|
||||
self.has_warning: bool = False
|
||||
self.has_info: bool = False
|
||||
self.document_count: int = 0
|
||||
self.document_error_count: int = 0
|
||||
self.document_warning_count: int = 0
|
||||
self.document_info_count: int = 0
|
||||
self.global_warning_count: int = 0
|
||||
self._messages: dict[int, list[dict]] = defaultdict(list)
|
||||
self.has_error = False
|
||||
self.has_warning = False
|
||||
|
||||
# -- Recording ----------------------------------------------------------
|
||||
|
||||
def error(self, doc_pk: int | None, message: str) -> None:
|
||||
def error(self, doc_pk, message) -> None:
|
||||
self._messages[doc_pk].append({"level": logging.ERROR, "message": message})
|
||||
self.has_error = True
|
||||
if doc_pk is not None:
|
||||
self.document_count += 1
|
||||
self.document_error_count += 1
|
||||
|
||||
def warning(self, doc_pk: int | None, message: str) -> None:
|
||||
def warning(self, doc_pk, message) -> None:
|
||||
self._messages[doc_pk].append({"level": logging.WARNING, "message": message})
|
||||
self.has_warning = True
|
||||
|
||||
if doc_pk is not None:
|
||||
self.document_count += 1
|
||||
self.document_warning_count += 1
|
||||
else:
|
||||
# This is the only type of global message we do right now
|
||||
self.global_warning_count += 1
|
||||
|
||||
def info(self, doc_pk: int | None, message: str) -> None:
|
||||
def info(self, doc_pk, message) -> None:
|
||||
self._messages[doc_pk].append({"level": logging.INFO, "message": message})
|
||||
self.has_info = True
|
||||
|
||||
if doc_pk is not None:
|
||||
self.document_count += 1
|
||||
self.document_info_count += 1
|
||||
|
||||
# -- Iteration / query --------------------------------------------------
|
||||
|
||||
def document_pks(self) -> list[int | None]:
|
||||
"""Return all document PKs (including None for global messages)."""
|
||||
return list(self._messages.keys())
|
||||
|
||||
def iter_messages(self) -> Iterator[tuple[int | None, list[MessageEntry]]]:
|
||||
"""Iterate over (doc_pk, messages) pairs."""
|
||||
yield from self._messages.items()
|
||||
|
||||
def __getitem__(self, item: int | None) -> list[MessageEntry]:
|
||||
return self._messages[item]
|
||||
|
||||
# -- Summarize Helpers --------------------------------------------------
|
||||
|
||||
@property
|
||||
def has_global_issues(self) -> bool:
|
||||
return None in self._messages
|
||||
|
||||
@property
|
||||
def total_issue_count(self) -> int:
|
||||
"""Total number of error and warning messages across all documents and global."""
|
||||
return (
|
||||
self.document_error_count
|
||||
+ self.document_warning_count
|
||||
+ self.global_warning_count
|
||||
)
|
||||
|
||||
# -- Logging output (used by Celery task path) --------------------------
|
||||
|
||||
def log_messages(self) -> None:
|
||||
"""Write all messages to the ``paperless.sanity_checker`` logger.
|
||||
logger = logging.getLogger("paperless.sanity_checker")
|
||||
|
||||
This is the output path for headless / Celery execution.
|
||||
Management commands use Rich rendering instead.
|
||||
"""
|
||||
if len(self._messages) == 0:
|
||||
logger.info("Sanity checker detected no issues.")
|
||||
return
|
||||
else:
|
||||
# Query once
|
||||
all_docs = Document.global_objects.all()
|
||||
|
||||
doc_pks = [pk for pk in self._messages if pk is not None]
|
||||
titles: dict[int, str] = {}
|
||||
if doc_pks:
|
||||
titles = dict(
|
||||
Document.global_objects.filter(pk__in=doc_pks)
|
||||
.only("pk", "title")
|
||||
.values_list("pk", "title"),
|
||||
)
|
||||
for doc_pk in self._messages:
|
||||
if doc_pk is not None:
|
||||
doc = all_docs.get(pk=doc_pk)
|
||||
logger.info(
|
||||
f"Detected following issue(s) with document #{doc.pk},"
|
||||
f" titled {doc.title}",
|
||||
)
|
||||
for msg in self._messages[doc_pk]:
|
||||
logger.log(msg["level"], msg["message"])
|
||||
|
||||
for doc_pk, entries in self._messages.items():
|
||||
if doc_pk is not None:
|
||||
title = titles.get(doc_pk, "Unknown")
|
||||
logger.info(
|
||||
"Detected following issue(s) with document #%s, titled %s",
|
||||
doc_pk,
|
||||
title,
|
||||
)
|
||||
for msg in entries:
|
||||
logger.log(msg["level"], msg["message"])
|
||||
def __len__(self):
|
||||
return len(self._messages)
|
||||
|
||||
def __getitem__(self, item):
|
||||
return self._messages[item]
|
||||
|
||||
|
||||
class SanityCheckFailedException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
def check_sanity(*, progress=False, scheduled=True) -> SanityCheckMessages:
|
||||
paperless_task = PaperlessTask.objects.create(
|
||||
task_id=uuid.uuid4(),
|
||||
type=PaperlessTask.TaskType.SCHEDULED_TASK
|
||||
if scheduled
|
||||
else PaperlessTask.TaskType.MANUAL_TASK,
|
||||
task_name=PaperlessTask.TaskName.CHECK_SANITY,
|
||||
status=states.STARTED,
|
||||
date_created=timezone.now(),
|
||||
date_started=timezone.now(),
|
||||
)
|
||||
messages = SanityCheckMessages()
|
||||
|
||||
|
||||
def _build_present_files() -> set[Path]:
|
||||
"""Collect all files in MEDIA_ROOT, excluding directories and ignorable files."""
|
||||
present_files = {
|
||||
x.resolve()
|
||||
for x in Path(settings.MEDIA_ROOT).glob("**/*")
|
||||
@@ -176,178 +82,95 @@ def _build_present_files() -> set[Path]:
|
||||
}
|
||||
|
||||
lockfile = Path(settings.MEDIA_LOCK).resolve()
|
||||
present_files.discard(lockfile)
|
||||
if lockfile in present_files:
|
||||
present_files.remove(lockfile)
|
||||
|
||||
general_config = GeneralConfig()
|
||||
app_logo = general_config.app_logo or settings.APP_LOGO
|
||||
if app_logo:
|
||||
logo_file = Path(settings.MEDIA_ROOT / Path(app_logo.lstrip("/"))).resolve()
|
||||
present_files.discard(logo_file)
|
||||
if logo_file in present_files:
|
||||
present_files.remove(logo_file)
|
||||
|
||||
return present_files
|
||||
|
||||
|
||||
def _check_thumbnail(
|
||||
doc: Document,
|
||||
messages: SanityCheckMessages,
|
||||
present_files: set[Path],
|
||||
) -> None:
|
||||
"""Verify the thumbnail exists and is readable."""
|
||||
thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
|
||||
if not thumbnail_path.exists() or not thumbnail_path.is_file():
|
||||
messages.error(doc.pk, "Thumbnail of document does not exist.")
|
||||
return
|
||||
|
||||
present_files.discard(thumbnail_path)
|
||||
try:
|
||||
_ = thumbnail_path.read_bytes()
|
||||
except OSError as e:
|
||||
messages.error(doc.pk, f"Cannot read thumbnail file of document: {e}")
|
||||
|
||||
|
||||
def _check_original(
|
||||
doc: Document,
|
||||
messages: SanityCheckMessages,
|
||||
present_files: set[Path],
|
||||
) -> None:
|
||||
"""Verify the original file exists, is readable, and has matching checksum."""
|
||||
source_path: Final[Path] = Path(doc.source_path).resolve()
|
||||
if not source_path.exists() or not source_path.is_file():
|
||||
messages.error(doc.pk, "Original of document does not exist.")
|
||||
return
|
||||
|
||||
present_files.discard(source_path)
|
||||
try:
|
||||
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.error(doc.pk, f"Cannot read original file of document: {e}")
|
||||
else:
|
||||
if checksum != doc.checksum:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
f"Checksum mismatch. Stored: {doc.checksum}, actual: {checksum}.",
|
||||
)
|
||||
|
||||
|
||||
def _check_archive(
|
||||
doc: Document,
|
||||
messages: SanityCheckMessages,
|
||||
present_files: set[Path],
|
||||
) -> None:
|
||||
"""Verify archive file consistency: checksum/filename pairing and file integrity."""
|
||||
if doc.archive_checksum is not None and doc.archive_filename is None:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Document has an archive file checksum, but no archive filename.",
|
||||
)
|
||||
elif doc.archive_checksum is None and doc.archive_filename is not None:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Document has an archive file, but its checksum is missing.",
|
||||
)
|
||||
elif doc.has_archive_version:
|
||||
if TYPE_CHECKING:
|
||||
assert isinstance(doc.archive_path, Path)
|
||||
archive_path: Final[Path] = Path(doc.archive_path).resolve()
|
||||
if not archive_path.exists() or not archive_path.is_file():
|
||||
messages.error(doc.pk, "Archived version of document does not exist.")
|
||||
return
|
||||
|
||||
present_files.discard(archive_path)
|
||||
try:
|
||||
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
f"Cannot read archive file of document: {e}",
|
||||
)
|
||||
for doc in tqdm(Document.global_objects.all(), disable=not progress):
|
||||
# Check sanity of the thumbnail
|
||||
thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
|
||||
if not thumbnail_path.exists() or not thumbnail_path.is_file():
|
||||
messages.error(doc.pk, "Thumbnail of document does not exist.")
|
||||
else:
|
||||
if checksum != doc.archive_checksum:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Checksum mismatch of archived document. "
|
||||
f"Stored: {doc.archive_checksum}, actual: {checksum}.",
|
||||
)
|
||||
if thumbnail_path in present_files:
|
||||
present_files.remove(thumbnail_path)
|
||||
try:
|
||||
_ = thumbnail_path.read_bytes()
|
||||
except OSError as e:
|
||||
messages.error(doc.pk, f"Cannot read thumbnail file of document: {e}")
|
||||
|
||||
# Check sanity of the original file
|
||||
# TODO: extract method
|
||||
source_path: Final[Path] = Path(doc.source_path).resolve()
|
||||
if not source_path.exists() or not source_path.is_file():
|
||||
messages.error(doc.pk, "Original of document does not exist.")
|
||||
else:
|
||||
if source_path in present_files:
|
||||
present_files.remove(source_path)
|
||||
try:
|
||||
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.error(doc.pk, f"Cannot read original file of document: {e}")
|
||||
else:
|
||||
if checksum != doc.checksum:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Checksum mismatch. "
|
||||
f"Stored: {doc.checksum}, actual: {checksum}.",
|
||||
)
|
||||
|
||||
def _check_content(doc: Document, messages: SanityCheckMessages) -> None:
|
||||
"""Flag documents with no OCR content."""
|
||||
if not doc.content:
|
||||
messages.info(doc.pk, "Document contains no OCR data")
|
||||
# Check sanity of the archive file.
|
||||
if doc.archive_checksum is not None and doc.archive_filename is None:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Document has an archive file checksum, but no archive filename.",
|
||||
)
|
||||
elif doc.archive_checksum is None and doc.archive_filename is not None:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Document has an archive file, but its checksum is missing.",
|
||||
)
|
||||
elif doc.has_archive_version:
|
||||
archive_path: Final[Path] = Path(doc.archive_path).resolve()
|
||||
if not archive_path.exists() or not archive_path.is_file():
|
||||
messages.error(doc.pk, "Archived version of document does not exist.")
|
||||
else:
|
||||
if archive_path in present_files:
|
||||
present_files.remove(archive_path)
|
||||
try:
|
||||
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
f"Cannot read archive file of document : {e}",
|
||||
)
|
||||
else:
|
||||
if checksum != doc.archive_checksum:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Checksum mismatch of archived document. "
|
||||
f"Stored: {doc.archive_checksum}, "
|
||||
f"actual: {checksum}.",
|
||||
)
|
||||
|
||||
|
||||
def _check_document(
|
||||
doc: Document,
|
||||
messages: SanityCheckMessages,
|
||||
present_files: set[Path],
|
||||
) -> None:
|
||||
"""Run all checks for a single document."""
|
||||
_check_thumbnail(doc, messages, present_files)
|
||||
_check_original(doc, messages, present_files)
|
||||
_check_archive(doc, messages, present_files)
|
||||
_check_content(doc, messages)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def check_sanity(
|
||||
*,
|
||||
scheduled: bool = True,
|
||||
iter_wrapper: IterWrapper[Document] = _identity,
|
||||
) -> SanityCheckMessages:
|
||||
"""Run a full sanity check on the document archive.
|
||||
|
||||
Args:
|
||||
scheduled: Whether this is a scheduled (automatic) or manual check.
|
||||
Controls the task type recorded in the database.
|
||||
iter_wrapper: A callable that wraps the document iterable, e.g.,
|
||||
for progress bar display. Defaults to identity (no wrapping).
|
||||
|
||||
Returns:
|
||||
A SanityCheckMessages instance containing all detected issues.
|
||||
"""
|
||||
paperless_task = PaperlessTask.objects.create(
|
||||
task_id=uuid.uuid4(),
|
||||
type=(
|
||||
PaperlessTask.TaskType.SCHEDULED_TASK
|
||||
if scheduled
|
||||
else PaperlessTask.TaskType.MANUAL_TASK
|
||||
),
|
||||
task_name=PaperlessTask.TaskName.CHECK_SANITY,
|
||||
status=states.STARTED,
|
||||
date_created=timezone.now(),
|
||||
date_started=timezone.now(),
|
||||
)
|
||||
|
||||
messages = SanityCheckMessages()
|
||||
present_files = _build_present_files()
|
||||
|
||||
documents = Document.global_objects.all()
|
||||
for doc in iter_wrapper(documents):
|
||||
_check_document(doc, messages, present_files)
|
||||
# other document checks
|
||||
if not doc.content:
|
||||
messages.info(doc.pk, "Document contains no OCR data")
|
||||
|
||||
for extra_file in present_files:
|
||||
messages.warning(None, f"Orphaned file in media dir: {extra_file}")
|
||||
|
||||
paperless_task.status = states.SUCCESS if not messages.has_error else states.FAILURE
|
||||
if messages.total_issue_count == 0:
|
||||
paperless_task.result = "No issues found."
|
||||
else:
|
||||
parts: list[str] = []
|
||||
if messages.document_error_count:
|
||||
parts.append(f"{messages.document_error_count} document(s) with errors")
|
||||
if messages.document_warning_count:
|
||||
parts.append(f"{messages.document_warning_count} document(s) with warnings")
|
||||
if messages.global_warning_count:
|
||||
parts.append(f"{messages.global_warning_count} global warning(s)")
|
||||
paperless_task.result = ", ".join(parts) + " found."
|
||||
if messages.has_error:
|
||||
paperless_task.result += " Check logs for details."
|
||||
|
||||
# result is concatenated messages
|
||||
paperless_task.result = f"{len(messages)} issues found."
|
||||
if messages.has_error:
|
||||
paperless_task.result += " Check logs for details."
|
||||
paperless_task.date_done = timezone.now()
|
||||
paperless_task.save(update_fields=["status", "result", "date_done"])
|
||||
|
||||
return messages
|
||||
|
||||
@@ -4,6 +4,7 @@ import logging
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Any
|
||||
|
||||
from celery import shared_task
|
||||
from celery import states
|
||||
@@ -32,12 +33,14 @@ from documents.file_handling import create_source_path_directory
|
||||
from documents.file_handling import delete_empty_directories
|
||||
from documents.file_handling import generate_filename
|
||||
from documents.file_handling import generate_unique_filename
|
||||
from documents.models import Correspondent
|
||||
from documents.models import CustomField
|
||||
from documents.models import CustomFieldInstance
|
||||
from documents.models import Document
|
||||
from documents.models import MatchingModel
|
||||
from documents.models import DocumentType
|
||||
from documents.models import PaperlessTask
|
||||
from documents.models import SavedView
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.models import UiSettings
|
||||
from documents.models import Workflow
|
||||
@@ -81,47 +84,41 @@ def add_inbox_tags(sender, document: Document, logging_group=None, **kwargs) ->
|
||||
document.add_nested_tags(inbox_tags)
|
||||
|
||||
|
||||
def _suggestion_printer(
|
||||
stdout,
|
||||
style_func,
|
||||
suggestion_type: str,
|
||||
document: Document,
|
||||
selected: MatchingModel,
|
||||
base_url: str | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Smaller helper to reduce duplication when just outputting suggestions to the console
|
||||
"""
|
||||
doc_str = str(document)
|
||||
if base_url is not None:
|
||||
stdout.write(style_func.SUCCESS(doc_str))
|
||||
stdout.write(style_func.SUCCESS(f"{base_url}/documents/{document.pk}"))
|
||||
else:
|
||||
stdout.write(style_func.SUCCESS(f"{doc_str} [{document.pk}]"))
|
||||
stdout.write(f"Suggest {suggestion_type}: {selected}")
|
||||
|
||||
|
||||
def set_correspondent(
|
||||
sender,
|
||||
sender: object,
|
||||
document: Document,
|
||||
*,
|
||||
logging_group=None,
|
||||
logging_group: object = None,
|
||||
classifier: DocumentClassifier | None = None,
|
||||
replace=False,
|
||||
use_first=True,
|
||||
suggest=False,
|
||||
base_url=None,
|
||||
stdout=None,
|
||||
style_func=None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
replace: bool = False,
|
||||
use_first: bool = True,
|
||||
dry_run: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> Correspondent | None:
|
||||
"""
|
||||
Assign a correspondent to a document based on classifier results.
|
||||
|
||||
Args:
|
||||
document: The document to classify.
|
||||
logging_group: Optional logging group for structured log output.
|
||||
classifier: The trained classifier. If None, only rule-based matching runs.
|
||||
replace: If True, overwrite an existing correspondent assignment.
|
||||
use_first: If True, pick the first match when multiple correspondents
|
||||
match. If False, skip assignment when multiple match.
|
||||
dry_run: If True, compute and return the selection without saving.
|
||||
**kwargs: Absorbed for Django signal compatibility (e.g. sender, signal).
|
||||
|
||||
Returns:
|
||||
The correspondent that was (or would be) assigned, or None if no match
|
||||
was found or assignment was skipped.
|
||||
"""
|
||||
if document.correspondent and not replace:
|
||||
return
|
||||
return None
|
||||
|
||||
potential_correspondents = matching.match_correspondents(document, classifier)
|
||||
|
||||
potential_count = len(potential_correspondents)
|
||||
selected = potential_correspondents[0] if potential_correspondents else None
|
||||
|
||||
if potential_count > 1:
|
||||
if use_first:
|
||||
logger.debug(
|
||||
@@ -135,49 +132,53 @@ def set_correspondent(
|
||||
f"not assigning any correspondent",
|
||||
extra={"group": logging_group},
|
||||
)
|
||||
return
|
||||
return None
|
||||
|
||||
if selected or replace:
|
||||
if suggest:
|
||||
_suggestion_printer(
|
||||
stdout,
|
||||
style_func,
|
||||
"correspondent",
|
||||
document,
|
||||
selected,
|
||||
base_url,
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
f"Assigning correspondent {selected} to {document}",
|
||||
extra={"group": logging_group},
|
||||
)
|
||||
if (selected or replace) and not dry_run:
|
||||
logger.info(
|
||||
f"Assigning correspondent {selected} to {document}",
|
||||
extra={"group": logging_group},
|
||||
)
|
||||
document.correspondent = selected
|
||||
document.save(update_fields=("correspondent",))
|
||||
|
||||
document.correspondent = selected
|
||||
document.save(update_fields=("correspondent",))
|
||||
return selected
|
||||
|
||||
|
||||
def set_document_type(
|
||||
sender,
|
||||
sender: object,
|
||||
document: Document,
|
||||
*,
|
||||
logging_group=None,
|
||||
logging_group: object = None,
|
||||
classifier: DocumentClassifier | None = None,
|
||||
replace=False,
|
||||
use_first=True,
|
||||
suggest=False,
|
||||
base_url=None,
|
||||
stdout=None,
|
||||
style_func=None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
replace: bool = False,
|
||||
use_first: bool = True,
|
||||
dry_run: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> DocumentType | None:
|
||||
"""
|
||||
Assign a document type to a document based on classifier results.
|
||||
|
||||
Args:
|
||||
document: The document to classify.
|
||||
logging_group: Optional logging group for structured log output.
|
||||
classifier: The trained classifier. If None, only rule-based matching runs.
|
||||
replace: If True, overwrite an existing document type assignment.
|
||||
use_first: If True, pick the first match when multiple types match.
|
||||
If False, skip assignment when multiple match.
|
||||
dry_run: If True, compute and return the selection without saving.
|
||||
**kwargs: Absorbed for Django signal compatibility (e.g. sender, signal).
|
||||
|
||||
Returns:
|
||||
The document type that was (or would be) assigned, or None if no match
|
||||
was found or assignment was skipped.
|
||||
"""
|
||||
if document.document_type and not replace:
|
||||
return
|
||||
return None
|
||||
|
||||
potential_document_type = matching.match_document_types(document, classifier)
|
||||
|
||||
potential_count = len(potential_document_type)
|
||||
selected = potential_document_type[0] if potential_document_type else None
|
||||
potential_document_types = matching.match_document_types(document, classifier)
|
||||
potential_count = len(potential_document_types)
|
||||
selected = potential_document_types[0] if potential_document_types else None
|
||||
|
||||
if potential_count > 1:
|
||||
if use_first:
|
||||
@@ -192,42 +193,64 @@ def set_document_type(
|
||||
f"not assigning any document type",
|
||||
extra={"group": logging_group},
|
||||
)
|
||||
return
|
||||
return None
|
||||
|
||||
if selected or replace:
|
||||
if suggest:
|
||||
_suggestion_printer(
|
||||
stdout,
|
||||
style_func,
|
||||
"document type",
|
||||
document,
|
||||
selected,
|
||||
base_url,
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
f"Assigning document type {selected} to {document}",
|
||||
extra={"group": logging_group},
|
||||
)
|
||||
if (selected or replace) and not dry_run:
|
||||
logger.info(
|
||||
f"Assigning document type {selected} to {document}",
|
||||
extra={"group": logging_group},
|
||||
)
|
||||
document.document_type = selected
|
||||
document.save(update_fields=("document_type",))
|
||||
|
||||
document.document_type = selected
|
||||
document.save(update_fields=("document_type",))
|
||||
return selected
|
||||
|
||||
|
||||
def set_tags(
|
||||
sender,
|
||||
sender: object,
|
||||
document: Document,
|
||||
*,
|
||||
logging_group=None,
|
||||
logging_group: object = None,
|
||||
classifier: DocumentClassifier | None = None,
|
||||
replace=False,
|
||||
suggest=False,
|
||||
base_url=None,
|
||||
stdout=None,
|
||||
style_func=None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
replace: bool = False,
|
||||
dry_run: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> tuple[set[Tag], set[Tag]]:
|
||||
"""
|
||||
Assign tags to a document based on classifier results.
|
||||
|
||||
When replace=True, existing auto-matched and rule-matched tags are removed
|
||||
before applying the new set (inbox tags and manually-added tags are preserved).
|
||||
|
||||
Args:
|
||||
document: The document to classify.
|
||||
logging_group: Optional logging group for structured log output.
|
||||
classifier: The trained classifier. If None, only rule-based matching runs.
|
||||
replace: If True, remove existing classifier-managed tags before applying
|
||||
new ones. Inbox tags and manually-added tags are always preserved.
|
||||
dry_run: If True, compute what would change without saving anything.
|
||||
**kwargs: Absorbed for Django signal compatibility (e.g. sender, signal).
|
||||
|
||||
Returns:
|
||||
A two-tuple of (tags_added, tags_removed). In non-replace mode,
|
||||
tags_removed is always an empty set. In dry_run mode, neither set
|
||||
is applied to the database.
|
||||
"""
|
||||
# Compute which tags would be removed under replace mode.
|
||||
# The filter mirrors the .delete() call below: keep inbox tags and
|
||||
# manually-added tags (match="" and not auto-matched).
|
||||
if replace:
|
||||
tags_to_remove: set[Tag] = set(
|
||||
document.tags.exclude(
|
||||
is_inbox_tag=True,
|
||||
).exclude(
|
||||
Q(match="") & ~Q(matching_algorithm=Tag.MATCH_AUTO),
|
||||
),
|
||||
)
|
||||
else:
|
||||
tags_to_remove = set()
|
||||
|
||||
if replace and not dry_run:
|
||||
Document.tags.through.objects.filter(document=document).exclude(
|
||||
Q(tag__is_inbox_tag=True),
|
||||
).exclude(
|
||||
@@ -235,65 +258,53 @@ def set_tags(
|
||||
).delete()
|
||||
|
||||
current_tags = set(document.tags.all())
|
||||
|
||||
matched_tags = matching.match_tags(document, classifier)
|
||||
tags_to_add = set(matched_tags) - current_tags
|
||||
|
||||
relevant_tags = set(matched_tags) - current_tags
|
||||
|
||||
if suggest:
|
||||
extra_tags = current_tags - set(matched_tags)
|
||||
extra_tags = [
|
||||
t for t in extra_tags if t.matching_algorithm == MatchingModel.MATCH_AUTO
|
||||
]
|
||||
if not relevant_tags and not extra_tags:
|
||||
return
|
||||
doc_str = style_func.SUCCESS(str(document))
|
||||
if base_url:
|
||||
stdout.write(doc_str)
|
||||
stdout.write(f"{base_url}/documents/{document.pk}")
|
||||
else:
|
||||
stdout.write(doc_str + style_func.SUCCESS(f" [{document.pk}]"))
|
||||
if relevant_tags:
|
||||
stdout.write("Suggest tags: " + ", ".join([t.name for t in relevant_tags]))
|
||||
if extra_tags:
|
||||
stdout.write("Extra tags: " + ", ".join([t.name for t in extra_tags]))
|
||||
else:
|
||||
if not relevant_tags:
|
||||
return
|
||||
|
||||
message = 'Tagging "{}" with "{}"'
|
||||
if tags_to_add and not dry_run:
|
||||
logger.info(
|
||||
message.format(document, ", ".join([t.name for t in relevant_tags])),
|
||||
f'Tagging "{document}" with "{", ".join(t.name for t in tags_to_add)}"',
|
||||
extra={"group": logging_group},
|
||||
)
|
||||
document.add_nested_tags(tags_to_add)
|
||||
|
||||
document.add_nested_tags(relevant_tags)
|
||||
return tags_to_add, tags_to_remove
|
||||
|
||||
|
||||
def set_storage_path(
|
||||
sender,
|
||||
sender: object,
|
||||
document: Document,
|
||||
*,
|
||||
logging_group=None,
|
||||
logging_group: object = None,
|
||||
classifier: DocumentClassifier | None = None,
|
||||
replace=False,
|
||||
use_first=True,
|
||||
suggest=False,
|
||||
base_url=None,
|
||||
stdout=None,
|
||||
style_func=None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
replace: bool = False,
|
||||
use_first: bool = True,
|
||||
dry_run: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> StoragePath | None:
|
||||
"""
|
||||
Assign a storage path to a document based on classifier results.
|
||||
|
||||
Args:
|
||||
document: The document to classify.
|
||||
logging_group: Optional logging group for structured log output.
|
||||
classifier: The trained classifier. If None, only rule-based matching runs.
|
||||
replace: If True, overwrite an existing storage path assignment.
|
||||
use_first: If True, pick the first match when multiple paths match.
|
||||
If False, skip assignment when multiple match.
|
||||
dry_run: If True, compute and return the selection without saving.
|
||||
**kwargs: Absorbed for Django signal compatibility (e.g. sender, signal).
|
||||
|
||||
Returns:
|
||||
The storage path that was (or would be) assigned, or None if no match
|
||||
was found or assignment was skipped.
|
||||
"""
|
||||
if document.storage_path and not replace:
|
||||
return
|
||||
return None
|
||||
|
||||
potential_storage_path = matching.match_storage_paths(
|
||||
document,
|
||||
classifier,
|
||||
)
|
||||
|
||||
potential_count = len(potential_storage_path)
|
||||
selected = potential_storage_path[0] if potential_storage_path else None
|
||||
potential_storage_paths = matching.match_storage_paths(document, classifier)
|
||||
potential_count = len(potential_storage_paths)
|
||||
selected = potential_storage_paths[0] if potential_storage_paths else None
|
||||
|
||||
if potential_count > 1:
|
||||
if use_first:
|
||||
@@ -308,26 +319,17 @@ def set_storage_path(
|
||||
f"not assigning any storage directory",
|
||||
extra={"group": logging_group},
|
||||
)
|
||||
return
|
||||
return None
|
||||
|
||||
if selected or replace:
|
||||
if suggest:
|
||||
_suggestion_printer(
|
||||
stdout,
|
||||
style_func,
|
||||
"storage directory",
|
||||
document,
|
||||
selected,
|
||||
base_url,
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
f"Assigning storage path {selected} to {document}",
|
||||
extra={"group": logging_group},
|
||||
)
|
||||
if (selected or replace) and not dry_run:
|
||||
logger.info(
|
||||
f"Assigning storage path {selected} to {document}",
|
||||
extra={"group": logging_group},
|
||||
)
|
||||
document.storage_path = selected
|
||||
document.save(update_fields=("storage_path",))
|
||||
|
||||
document.storage_path = selected
|
||||
document.save(update_fields=("storage_path",))
|
||||
return selected
|
||||
|
||||
|
||||
# see empty_trash in documents/tasks.py for signal handling
|
||||
@@ -596,16 +598,6 @@ def update_filename_and_move_files(
|
||||
root=settings.ARCHIVE_DIR,
|
||||
)
|
||||
|
||||
# Keep version files in sync with root
|
||||
if instance.root_document_id is None:
|
||||
for version_doc in Document.objects.filter(root_document_id=instance.pk).only(
|
||||
"pk",
|
||||
):
|
||||
update_filename_and_move_files(
|
||||
Document,
|
||||
version_doc,
|
||||
)
|
||||
|
||||
|
||||
@shared_task
|
||||
def process_cf_select_update(custom_field: CustomField) -> None:
|
||||
|
||||
@@ -4,13 +4,11 @@ import logging
|
||||
import shutil
|
||||
import uuid
|
||||
import zipfile
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from tempfile import mkstemp
|
||||
from typing import TypeVar
|
||||
|
||||
import tqdm
|
||||
from celery import Task
|
||||
from celery import shared_task
|
||||
from celery import states
|
||||
@@ -68,19 +66,11 @@ from paperless_ai.indexing import llm_index_add_or_update_document
|
||||
from paperless_ai.indexing import llm_index_remove_document
|
||||
from paperless_ai.indexing import update_llm_index
|
||||
|
||||
_T = TypeVar("_T")
|
||||
IterWrapper = Callable[[Iterable[_T]], Iterable[_T]]
|
||||
|
||||
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.models import LogEntry
|
||||
logger = logging.getLogger("paperless.tasks")
|
||||
|
||||
|
||||
def _identity(iterable: Iterable[_T]) -> Iterable[_T]:
|
||||
return iterable
|
||||
|
||||
|
||||
@shared_task
|
||||
def index_optimize() -> None:
|
||||
ix = index.open_index()
|
||||
@@ -88,13 +78,13 @@ def index_optimize() -> None:
|
||||
writer.commit(optimize=True)
|
||||
|
||||
|
||||
def index_reindex(*, iter_wrapper: IterWrapper[Document] = _identity) -> None:
|
||||
def index_reindex(*, progress_bar_disable=False) -> None:
|
||||
documents = Document.objects.all()
|
||||
|
||||
ix = index.open_index(recreate=True)
|
||||
|
||||
with AsyncWriter(ix) as writer:
|
||||
for document in iter_wrapper(documents):
|
||||
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
|
||||
index.update_document(writer, document)
|
||||
|
||||
|
||||
@@ -237,30 +227,20 @@ def consume_file(
|
||||
@shared_task
|
||||
def sanity_check(*, scheduled=True, raise_on_error=True):
|
||||
messages = sanity_checker.check_sanity(scheduled=scheduled)
|
||||
|
||||
messages.log_messages()
|
||||
|
||||
if not messages.has_error and not messages.has_warning and not messages.has_info:
|
||||
return "No issues detected."
|
||||
|
||||
parts: list[str] = []
|
||||
if messages.document_error_count:
|
||||
parts.append(f"{messages.document_error_count} document(s) with errors")
|
||||
if messages.document_warning_count:
|
||||
parts.append(f"{messages.document_warning_count} document(s) with warnings")
|
||||
if messages.document_info_count:
|
||||
parts.append(f"{messages.document_info_count} document(s) with infos")
|
||||
if messages.global_warning_count:
|
||||
parts.append(f"{messages.global_warning_count} global warning(s)")
|
||||
|
||||
summary = ", ".join(parts) + " found."
|
||||
|
||||
if messages.has_error:
|
||||
message = summary + " Check logs for details."
|
||||
message = "Sanity check exited with errors. See log."
|
||||
if raise_on_error:
|
||||
raise SanityCheckFailedException(message)
|
||||
return message
|
||||
|
||||
return summary
|
||||
elif messages.has_warning:
|
||||
return "Sanity check exited with warnings. See log."
|
||||
elif len(messages) > 0:
|
||||
return "Sanity check exited with infos. See log."
|
||||
else:
|
||||
return "No issues detected."
|
||||
|
||||
|
||||
@shared_task
|
||||
@@ -285,6 +265,7 @@ def bulk_update_documents(document_ids) -> None:
|
||||
ai_config = AIConfig()
|
||||
if ai_config.llm_index_enabled:
|
||||
update_llm_index(
|
||||
progress_bar_disable=True,
|
||||
rebuild=False,
|
||||
)
|
||||
|
||||
@@ -625,7 +606,7 @@ def update_document_parent_tags(tag: Tag, new_parent: Tag) -> None:
|
||||
@shared_task
|
||||
def llmindex_index(
|
||||
*,
|
||||
iter_wrapper: IterWrapper[Document] = _identity,
|
||||
progress_bar_disable=True,
|
||||
rebuild=False,
|
||||
scheduled=True,
|
||||
auto=False,
|
||||
@@ -648,7 +629,7 @@ def llmindex_index(
|
||||
|
||||
try:
|
||||
result = update_llm_index(
|
||||
iter_wrapper=iter_wrapper,
|
||||
progress_bar_disable=progress_bar_disable,
|
||||
rebuild=rebuild,
|
||||
)
|
||||
task.status = states.SUCCESS
|
||||
|
||||
@@ -1,96 +1,10 @@
|
||||
import shutil
|
||||
import zoneinfo
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import filelock
|
||||
import pytest
|
||||
from django.contrib.auth import get_user_model
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
from rest_framework.test import APIClient
|
||||
|
||||
from documents.tests.factories import DocumentFactory
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from documents.models import Document
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class PaperlessDirs:
|
||||
"""Standard Paperless-ngx directory layout for tests."""
|
||||
|
||||
media: Path
|
||||
originals: Path
|
||||
archive: Path
|
||||
thumbnails: Path
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def samples_dir() -> Path:
|
||||
"""Path to the shared test sample documents."""
|
||||
return Path(__file__).parent / "samples" / "documents"
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def paperless_dirs(tmp_path: Path) -> PaperlessDirs:
|
||||
"""Create and return the directory structure for testing."""
|
||||
media = tmp_path / "media"
|
||||
dirs = PaperlessDirs(
|
||||
media=media,
|
||||
originals=media / "documents" / "originals",
|
||||
archive=media / "documents" / "archive",
|
||||
thumbnails=media / "documents" / "thumbnails",
|
||||
)
|
||||
for d in (dirs.originals, dirs.archive, dirs.thumbnails):
|
||||
d.mkdir(parents=True)
|
||||
return dirs
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def _media_settings(paperless_dirs: PaperlessDirs, settings) -> None:
|
||||
"""Configure Django settings to point at temp directories."""
|
||||
settings.MEDIA_ROOT = paperless_dirs.media
|
||||
settings.ORIGINALS_DIR = paperless_dirs.originals
|
||||
settings.ARCHIVE_DIR = paperless_dirs.archive
|
||||
settings.THUMBNAIL_DIR = paperless_dirs.thumbnails
|
||||
settings.MEDIA_LOCK = paperless_dirs.media / "media.lock"
|
||||
settings.IGNORABLE_FILES = {".DS_Store", "Thumbs.db", "desktop.ini"}
|
||||
settings.APP_LOGO = ""
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def sample_doc(
|
||||
paperless_dirs: PaperlessDirs,
|
||||
_media_settings: None,
|
||||
samples_dir: Path,
|
||||
) -> "Document":
|
||||
"""Create a document with valid files and matching checksums."""
|
||||
with filelock.FileLock(paperless_dirs.media / "media.lock"):
|
||||
shutil.copy(
|
||||
samples_dir / "originals" / "0000001.pdf",
|
||||
paperless_dirs.originals / "0000001.pdf",
|
||||
)
|
||||
shutil.copy(
|
||||
samples_dir / "archive" / "0000001.pdf",
|
||||
paperless_dirs.archive / "0000001.pdf",
|
||||
)
|
||||
shutil.copy(
|
||||
samples_dir / "thumbnails" / "0000001.webp",
|
||||
paperless_dirs.thumbnails / "0000001.webp",
|
||||
)
|
||||
|
||||
return DocumentFactory(
|
||||
title="test",
|
||||
checksum="42995833e01aea9b3edee44bbfdd7ce1",
|
||||
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
|
||||
content="test content",
|
||||
pk=1,
|
||||
filename="0000001.pdf",
|
||||
mime_type="application/pdf",
|
||||
archive_filename="0000001.pdf",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def settings_timezone(settings: SettingsWrapper) -> zoneinfo.ZoneInfo:
|
||||
@@ -114,3 +28,14 @@ def authenticated_rest_api_client(rest_api_client: APIClient):
|
||||
user = UserModel.objects.create_user(username="testuser", password="password")
|
||||
rest_api_client.force_authenticate(user=user)
|
||||
yield rest_api_client
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def faker_session_locale():
|
||||
"""Set Faker locale for reproducibility."""
|
||||
return "en_US"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def faker_seed():
|
||||
return 12345
|
||||
|
||||
@@ -1,17 +1,67 @@
|
||||
from factory import Faker
|
||||
"""
|
||||
Factory-boy factories for documents app models.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import factory
|
||||
from factory.django import DjangoModelFactory
|
||||
|
||||
from documents.models import Correspondent
|
||||
from documents.models import Document
|
||||
from documents.models import DocumentType
|
||||
from documents.models import MatchingModel
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
|
||||
|
||||
class CorrespondentFactory(DjangoModelFactory):
|
||||
class Meta:
|
||||
model = Correspondent
|
||||
|
||||
name = Faker("name")
|
||||
name = factory.Sequence(lambda n: f"{factory.Faker('company')} {n}")
|
||||
match = ""
|
||||
matching_algorithm = MatchingModel.MATCH_NONE
|
||||
|
||||
|
||||
class DocumentTypeFactory(DjangoModelFactory):
|
||||
class Meta:
|
||||
model = DocumentType
|
||||
|
||||
name = factory.Sequence(lambda n: f"{factory.Faker('bs')} {n}")
|
||||
match = ""
|
||||
matching_algorithm = MatchingModel.MATCH_NONE
|
||||
|
||||
|
||||
class TagFactory(DjangoModelFactory):
|
||||
class Meta:
|
||||
model = Tag
|
||||
|
||||
name = factory.Sequence(lambda n: f"{factory.Faker('word')} {n}")
|
||||
match = ""
|
||||
matching_algorithm = MatchingModel.MATCH_NONE
|
||||
is_inbox_tag = False
|
||||
|
||||
|
||||
class StoragePathFactory(DjangoModelFactory):
|
||||
class Meta:
|
||||
model = StoragePath
|
||||
|
||||
name = factory.Sequence(
|
||||
lambda n: f"{factory.Faker('file_path', depth=2, extension='')} {n}",
|
||||
)
|
||||
path = factory.LazyAttribute(lambda o: f"{o.name}/{{title}}")
|
||||
match = ""
|
||||
matching_algorithm = MatchingModel.MATCH_NONE
|
||||
|
||||
|
||||
class DocumentFactory(DjangoModelFactory):
|
||||
class Meta:
|
||||
model = Document
|
||||
|
||||
title = factory.Faker("sentence", nb_words=4)
|
||||
checksum = factory.Faker("md5")
|
||||
content = factory.Faker("paragraph")
|
||||
correspondent = None
|
||||
document_type = None
|
||||
storage_path = None
|
||||
|
||||
@@ -1,193 +0,0 @@
|
||||
"""Tests for the document_sanity_checker management command.
|
||||
|
||||
Verifies Rich rendering (table, panel, summary) and end-to-end CLI behavior.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
from django.core.management import call_command
|
||||
from rich.console import Console
|
||||
|
||||
from documents.management.commands.document_sanity_checker import Command
|
||||
from documents.sanity_checker import SanityCheckMessages
|
||||
from documents.tests.factories import DocumentFactory
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from documents.models import Document
|
||||
from documents.tests.conftest import PaperlessDirs
|
||||
|
||||
|
||||
def _render_to_string(messages: SanityCheckMessages) -> str:
|
||||
"""Render command output to a plain string for assertion."""
|
||||
buf = StringIO()
|
||||
cmd = Command()
|
||||
cmd.console = Console(file=buf, width=120, no_color=True)
|
||||
cmd._render_results(messages)
|
||||
return buf.getvalue()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Rich rendering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRenderResultsNoIssues:
|
||||
"""No DB access needed -- renders an empty SanityCheckMessages."""
|
||||
|
||||
def test_shows_panel(self) -> None:
|
||||
output = _render_to_string(SanityCheckMessages())
|
||||
assert "No issues detected" in output
|
||||
assert "Sanity Check" in output
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestRenderResultsWithIssues:
|
||||
def test_error_row(self, sample_doc: Document) -> None:
|
||||
msgs = SanityCheckMessages()
|
||||
msgs.error(sample_doc.pk, "Original missing")
|
||||
output = _render_to_string(msgs)
|
||||
assert "Sanity Check Results" in output
|
||||
assert "ERROR" in output
|
||||
assert "Original missing" in output
|
||||
assert f"#{sample_doc.pk}" in output
|
||||
assert sample_doc.title in output
|
||||
|
||||
def test_warning_row(self, sample_doc: Document) -> None:
|
||||
msgs = SanityCheckMessages()
|
||||
msgs.warning(sample_doc.pk, "Suspicious file")
|
||||
output = _render_to_string(msgs)
|
||||
assert "WARN" in output
|
||||
assert "Suspicious file" in output
|
||||
|
||||
def test_info_row(self, sample_doc: Document) -> None:
|
||||
msgs = SanityCheckMessages()
|
||||
msgs.info(sample_doc.pk, "No OCR data")
|
||||
output = _render_to_string(msgs)
|
||||
assert "INFO" in output
|
||||
assert "No OCR data" in output
|
||||
|
||||
@pytest.mark.usefixtures("_media_settings")
|
||||
def test_global_message(self) -> None:
|
||||
msgs = SanityCheckMessages()
|
||||
msgs.warning(None, "Orphaned file: /tmp/stray.pdf")
|
||||
output = _render_to_string(msgs)
|
||||
assert "(global)" in output
|
||||
assert "Orphaned file" in output
|
||||
|
||||
def test_multiple_messages_same_doc(self, sample_doc: Document) -> None:
|
||||
msgs = SanityCheckMessages()
|
||||
msgs.error(sample_doc.pk, "Thumbnail missing")
|
||||
msgs.error(sample_doc.pk, "Checksum mismatch")
|
||||
output = _render_to_string(msgs)
|
||||
assert "Thumbnail missing" in output
|
||||
assert "Checksum mismatch" in output
|
||||
|
||||
@pytest.mark.usefixtures("_media_settings")
|
||||
def test_unknown_doc_pk(self) -> None:
|
||||
msgs = SanityCheckMessages()
|
||||
msgs.error(99999, "Ghost document")
|
||||
output = _render_to_string(msgs)
|
||||
assert "#99999" in output
|
||||
assert "Unknown" in output
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestRenderResultsSummary:
|
||||
def test_errors_only(self, sample_doc: Document) -> None:
|
||||
msgs = SanityCheckMessages()
|
||||
msgs.error(sample_doc.pk, "broken")
|
||||
output = _render_to_string(msgs)
|
||||
assert "1 document(s) with" in output
|
||||
assert "errors" in output
|
||||
|
||||
def test_warnings_only(self, sample_doc: Document) -> None:
|
||||
msgs = SanityCheckMessages()
|
||||
msgs.warning(sample_doc.pk, "odd")
|
||||
output = _render_to_string(msgs)
|
||||
assert "1 document(s) with" in output
|
||||
assert "warnings" in output
|
||||
|
||||
def test_infos_only(self, sample_doc: Document) -> None:
|
||||
msgs = SanityCheckMessages()
|
||||
msgs.info(sample_doc.pk, "no OCR")
|
||||
output = _render_to_string(msgs)
|
||||
assert "1 document(s) with infos" in output
|
||||
|
||||
def test_empty_messages(self) -> None:
|
||||
msgs = SanityCheckMessages()
|
||||
output = _render_to_string(msgs)
|
||||
assert "No issues detected." in output
|
||||
|
||||
def test_document_errors_and_global_warnings(self, sample_doc: Document) -> None:
|
||||
msgs = SanityCheckMessages()
|
||||
msgs.error(sample_doc.pk, "broken")
|
||||
msgs.warning(None, "orphan")
|
||||
output = _render_to_string(msgs)
|
||||
assert "1 document(s) with" in output
|
||||
assert "errors" in output
|
||||
assert "1 global warning(s)" in output
|
||||
assert "2 document(s)" not in output
|
||||
|
||||
def test_global_warnings_only(self) -> None:
|
||||
msgs = SanityCheckMessages()
|
||||
msgs.warning(None, "extra file")
|
||||
output = _render_to_string(msgs)
|
||||
assert "1 global warning(s)" in output
|
||||
assert "document(s) with" not in output
|
||||
|
||||
def test_all_levels_combined(self, sample_doc: Document) -> None:
|
||||
msgs = SanityCheckMessages()
|
||||
msgs.error(sample_doc.pk, "broken")
|
||||
msgs.warning(sample_doc.pk, "odd")
|
||||
msgs.info(sample_doc.pk, "fyi")
|
||||
msgs.warning(None, "extra file")
|
||||
output = _render_to_string(msgs)
|
||||
assert "1 document(s) with errors" in output
|
||||
assert "1 document(s) with warnings" in output
|
||||
assert "1 document(s) with infos" in output
|
||||
assert "1 global warning(s)" in output
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# End-to-end command execution
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
@pytest.mark.management
|
||||
class TestDocumentSanityCheckerCommand:
|
||||
def test_no_issues(self, sample_doc: Document) -> None:
|
||||
out = StringIO()
|
||||
call_command("document_sanity_checker", "--no-progress-bar", stdout=out)
|
||||
assert "No issues detected" in out.getvalue()
|
||||
|
||||
def test_missing_original(self, sample_doc: Document) -> None:
|
||||
Path(sample_doc.source_path).unlink()
|
||||
out = StringIO()
|
||||
call_command("document_sanity_checker", "--no-progress-bar", stdout=out)
|
||||
output = out.getvalue()
|
||||
assert "ERROR" in output
|
||||
assert "Original of document does not exist" in output
|
||||
|
||||
@pytest.mark.usefixtures("_media_settings")
|
||||
def test_checksum_mismatch(self, paperless_dirs: PaperlessDirs) -> None:
|
||||
"""Lightweight document with zero-byte files triggers checksum mismatch."""
|
||||
doc = DocumentFactory(
|
||||
title="test",
|
||||
content="test",
|
||||
filename="test.pdf",
|
||||
checksum="abc",
|
||||
)
|
||||
Path(doc.source_path).touch()
|
||||
Path(doc.thumbnail_path).touch()
|
||||
|
||||
out = StringIO()
|
||||
call_command("document_sanity_checker", "--no-progress-bar", stdout=out)
|
||||
output = out.getvalue()
|
||||
assert "ERROR" in output
|
||||
assert "Checksum mismatch. Stored: abc, actual:" in output
|
||||
@@ -699,14 +699,6 @@ class TestConsumer(
|
||||
self.assertIsNotNone(root_doc)
|
||||
assert root_doc is not None
|
||||
|
||||
root_storage_path = StoragePath.objects.create(
|
||||
name="version-root-path",
|
||||
path="root/{{title}}",
|
||||
)
|
||||
root_doc.storage_path = root_storage_path
|
||||
root_doc.archive_serial_number = 42
|
||||
root_doc.save()
|
||||
|
||||
actor = User.objects.create_user(
|
||||
username="actor",
|
||||
email="actor@example.com",
|
||||
@@ -743,7 +735,7 @@ class TestConsumer(
|
||||
)
|
||||
consumer.setup()
|
||||
try:
|
||||
self.assertEqual(consumer.filename, version_file.name)
|
||||
self.assertTrue(consumer.filename.endswith("_v0.pdf"))
|
||||
consumer.run()
|
||||
finally:
|
||||
consumer.cleanup()
|
||||
@@ -753,10 +745,8 @@ class TestConsumer(
|
||||
version = versions.first()
|
||||
assert version is not None
|
||||
assert version.original_filename is not None
|
||||
self.assertEqual(version.version_index, 1)
|
||||
self.assertEqual(version.version_label, "v2")
|
||||
self.assertIsNone(version.archive_serial_number)
|
||||
self.assertEqual(version.original_filename, version_file.name)
|
||||
self.assertTrue(version.original_filename.endswith("_v0.pdf"))
|
||||
self.assertTrue(bool(version.content))
|
||||
|
||||
@override_settings(AUDIT_LOG_ENABLED=True)
|
||||
@@ -805,7 +795,7 @@ class TestConsumer(
|
||||
)
|
||||
consumer.setup()
|
||||
try:
|
||||
self.assertEqual(consumer.filename, "valid_pdf_version-upload")
|
||||
self.assertEqual(consumer.filename, "valid_pdf_version-upload_v0")
|
||||
consumer.run()
|
||||
finally:
|
||||
consumer.cleanup()
|
||||
@@ -815,67 +805,9 @@ class TestConsumer(
|
||||
)
|
||||
self.assertIsNotNone(version)
|
||||
assert version is not None
|
||||
self.assertEqual(version.version_index, 1)
|
||||
self.assertEqual(version.original_filename, "valid_pdf_version-upload")
|
||||
self.assertEqual(version.original_filename, "valid_pdf_version-upload_v0")
|
||||
self.assertTrue(bool(version.content))
|
||||
|
||||
@override_settings(AUDIT_LOG_ENABLED=True)
|
||||
@mock.patch("documents.consumer.load_classifier")
|
||||
def test_consume_version_index_monotonic_after_version_deletion(self, m) -> None:
|
||||
m.return_value = MagicMock()
|
||||
|
||||
with self.get_consumer(self.get_test_file()) as consumer:
|
||||
consumer.run()
|
||||
|
||||
root_doc = Document.objects.first()
|
||||
self.assertIsNotNone(root_doc)
|
||||
assert root_doc is not None
|
||||
|
||||
def consume_version(version_file: Path) -> Document:
|
||||
status = DummyProgressManager(version_file.name, None)
|
||||
overrides = DocumentMetadataOverrides()
|
||||
doc = ConsumableDocument(
|
||||
DocumentSource.ApiUpload,
|
||||
original_file=version_file,
|
||||
root_document_id=root_doc.pk,
|
||||
)
|
||||
preflight = ConsumerPreflightPlugin(
|
||||
doc,
|
||||
overrides,
|
||||
status, # type: ignore[arg-type]
|
||||
self.dirs.scratch_dir,
|
||||
"task-id",
|
||||
)
|
||||
preflight.setup()
|
||||
preflight.run()
|
||||
|
||||
consumer = ConsumerPlugin(
|
||||
doc,
|
||||
overrides,
|
||||
status, # type: ignore[arg-type]
|
||||
self.dirs.scratch_dir,
|
||||
"task-id",
|
||||
)
|
||||
consumer.setup()
|
||||
try:
|
||||
consumer.run()
|
||||
finally:
|
||||
consumer.cleanup()
|
||||
|
||||
version = (
|
||||
Document.objects.filter(root_document=root_doc).order_by("-id").first()
|
||||
)
|
||||
assert version is not None
|
||||
return version
|
||||
|
||||
v1 = consume_version(self.get_test_file2())
|
||||
self.assertEqual(v1.version_index, 1)
|
||||
v1.delete()
|
||||
|
||||
# The next version should have version_index 2, even though version_index 1 was deleted
|
||||
v2 = consume_version(self.get_test_file())
|
||||
self.assertEqual(v2.version_index, 2)
|
||||
|
||||
@mock.patch("documents.consumer.load_classifier")
|
||||
def testClassifyDocument(self, m) -> None:
|
||||
correspondent = Correspondent.objects.create(
|
||||
|
||||
@@ -77,58 +77,6 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
settings.ORIGINALS_DIR / "test" / "test.pdf",
|
||||
)
|
||||
|
||||
@override_settings(FILENAME_FORMAT=None)
|
||||
def test_root_storage_path_change_updates_version_files(self) -> None:
|
||||
old_storage_path = StoragePath.objects.create(
|
||||
name="old-path",
|
||||
path="old/{{title}}",
|
||||
)
|
||||
new_storage_path = StoragePath.objects.create(
|
||||
name="new-path",
|
||||
path="new/{{title}}",
|
||||
)
|
||||
|
||||
root_doc = Document.objects.create(
|
||||
title="rootdoc",
|
||||
mime_type="application/pdf",
|
||||
checksum="root-checksum",
|
||||
storage_path=old_storage_path,
|
||||
)
|
||||
version_doc = Document.objects.create(
|
||||
title="version-title",
|
||||
mime_type="application/pdf",
|
||||
checksum="version-checksum",
|
||||
root_document=root_doc,
|
||||
version_index=1,
|
||||
)
|
||||
|
||||
Document.objects.filter(pk=root_doc.pk).update(
|
||||
filename=generate_filename(root_doc),
|
||||
)
|
||||
Document.objects.filter(pk=version_doc.pk).update(
|
||||
filename=generate_filename(version_doc),
|
||||
)
|
||||
root_doc.refresh_from_db()
|
||||
version_doc.refresh_from_db()
|
||||
|
||||
create_source_path_directory(root_doc.source_path)
|
||||
Path(root_doc.source_path).touch()
|
||||
create_source_path_directory(version_doc.source_path)
|
||||
Path(version_doc.source_path).touch()
|
||||
|
||||
root_doc.storage_path = new_storage_path
|
||||
root_doc.save()
|
||||
|
||||
root_doc.refresh_from_db()
|
||||
version_doc.refresh_from_db()
|
||||
|
||||
self.assertEqual(root_doc.filename, "new/rootdoc.pdf")
|
||||
self.assertEqual(version_doc.filename, "new/rootdoc_v1.pdf")
|
||||
self.assertIsFile(root_doc.source_path)
|
||||
self.assertIsFile(version_doc.source_path)
|
||||
self.assertIsNotFile(settings.ORIGINALS_DIR / "old" / "rootdoc.pdf")
|
||||
self.assertIsNotFile(settings.ORIGINALS_DIR / "old" / "rootdoc_v1.pdf")
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{correspondent}/{correspondent}")
|
||||
def test_file_renaming_missing_permissions(self) -> None:
|
||||
document = Document()
|
||||
@@ -1274,94 +1222,6 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
|
||||
Path("logs.pdf"),
|
||||
)
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{title}")
|
||||
def test_version_index_suffix_for_template_filename(self) -> None:
|
||||
root_doc = Document.objects.create(
|
||||
title="the_doc",
|
||||
mime_type="application/pdf",
|
||||
checksum="root-checksum",
|
||||
)
|
||||
version_doc = Document.objects.create(
|
||||
title="the_doc",
|
||||
mime_type="application/pdf",
|
||||
checksum="version-checksum",
|
||||
root_document=root_doc,
|
||||
version_index=1,
|
||||
)
|
||||
|
||||
self.assertEqual(generate_filename(version_doc), Path("the_doc_v1.pdf"))
|
||||
self.assertEqual(
|
||||
generate_filename(version_doc, counter=1),
|
||||
Path("the_doc_v1_01.pdf"),
|
||||
)
|
||||
|
||||
@override_settings(FILENAME_FORMAT=None)
|
||||
def test_version_index_suffix_for_default_filename(self) -> None:
|
||||
root_doc = Document.objects.create(
|
||||
title="root",
|
||||
mime_type="text/plain",
|
||||
checksum="root-checksum",
|
||||
)
|
||||
version_doc = Document.objects.create(
|
||||
title="root",
|
||||
mime_type="text/plain",
|
||||
checksum="version-checksum",
|
||||
root_document=root_doc,
|
||||
version_index=2,
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
generate_filename(version_doc),
|
||||
Path(f"{root_doc.pk:07d}_v2.txt"),
|
||||
)
|
||||
self.assertEqual(
|
||||
generate_filename(version_doc, archive_filename=True),
|
||||
Path(f"{root_doc.pk:07d}_v2.pdf"),
|
||||
)
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{original_name}")
|
||||
def test_version_index_suffix_with_original_name_placeholder(self) -> None:
|
||||
root_doc = Document.objects.create(
|
||||
title="root",
|
||||
mime_type="application/pdf",
|
||||
checksum="root-checksum",
|
||||
original_filename="root-upload.pdf",
|
||||
)
|
||||
version_doc = Document.objects.create(
|
||||
title="root",
|
||||
mime_type="application/pdf",
|
||||
checksum="version-checksum",
|
||||
root_document=root_doc,
|
||||
version_index=1,
|
||||
original_filename="version-upload.pdf",
|
||||
)
|
||||
|
||||
self.assertEqual(generate_filename(version_doc), Path("root-upload_v1.pdf"))
|
||||
|
||||
def test_version_index_suffix_with_storage_path(self) -> None:
|
||||
storage_path = StoragePath.objects.create(
|
||||
name="vtest",
|
||||
path="folder/{{title}}",
|
||||
)
|
||||
root_doc = Document.objects.create(
|
||||
title="storage_doc",
|
||||
mime_type="application/pdf",
|
||||
checksum="root-checksum",
|
||||
storage_path=storage_path,
|
||||
)
|
||||
version_doc = Document.objects.create(
|
||||
title="version_title_should_not_be_used",
|
||||
mime_type="application/pdf",
|
||||
checksum="version-checksum",
|
||||
root_document=root_doc,
|
||||
version_index=3,
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
generate_filename(version_doc),
|
||||
Path("folder/storage_doc_v3.pdf"),
|
||||
)
|
||||
|
||||
@override_settings(
|
||||
FILENAME_FORMAT="XX{correspondent}/{title}",
|
||||
FILENAME_FORMAT_REMOVE_NONE=True,
|
||||
|
||||
@@ -134,7 +134,6 @@ class TestRenamer(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
self.assertIsFile(doc2.archive_path)
|
||||
|
||||
|
||||
@pytest.mark.management
|
||||
class TestCreateClassifier(TestCase):
|
||||
@mock.patch(
|
||||
"documents.management.commands.document_create_classifier.train_classifier",
|
||||
@@ -145,6 +144,32 @@ class TestCreateClassifier(TestCase):
|
||||
m.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.management
|
||||
class TestSanityChecker(DirectoriesMixin, TestCase):
|
||||
def test_no_issues(self) -> None:
|
||||
with self.assertLogs() as capture:
|
||||
call_command("document_sanity_checker")
|
||||
|
||||
self.assertEqual(len(capture.output), 1)
|
||||
self.assertIn("Sanity checker detected no issues.", capture.output[0])
|
||||
|
||||
def test_errors(self) -> None:
|
||||
doc = Document.objects.create(
|
||||
title="test",
|
||||
content="test",
|
||||
filename="test.pdf",
|
||||
checksum="abc",
|
||||
)
|
||||
Path(doc.source_path).touch()
|
||||
Path(doc.thumbnail_path).touch()
|
||||
|
||||
with self.assertLogs() as capture:
|
||||
call_command("document_sanity_checker")
|
||||
|
||||
self.assertEqual(len(capture.output), 2)
|
||||
self.assertIn("Checksum mismatch. Stored: abc, actual:", capture.output[1])
|
||||
|
||||
|
||||
@pytest.mark.management
|
||||
class TestConvertMariaDBUUID(TestCase):
|
||||
@mock.patch("django.db.connection.schema_editor")
|
||||
|
||||
@@ -288,7 +288,7 @@ class TestExportImport(
|
||||
self.assertEqual(Permission.objects.count(), num_permission_objects)
|
||||
messages = check_sanity()
|
||||
# everything is alright after the test
|
||||
self.assertEqual(messages.total_issue_count, 0)
|
||||
self.assertEqual(len(messages), 0)
|
||||
|
||||
def test_exporter_with_filename_format(self) -> None:
|
||||
shutil.rmtree(Path(self.dirs.media_dir) / "documents")
|
||||
|
||||
@@ -1,298 +1,442 @@
|
||||
"""
|
||||
Tests for the document_retagger management command.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
from django.core.management import call_command
|
||||
from django.core.management.base import CommandError
|
||||
from django.test import TestCase
|
||||
|
||||
from documents.models import Correspondent
|
||||
from documents.models import Document
|
||||
from documents.models import DocumentType
|
||||
from documents.models import MatchingModel
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.tests.factories import CorrespondentFactory
|
||||
from documents.tests.factories import DocumentFactory
|
||||
from documents.tests.factories import DocumentTypeFactory
|
||||
from documents.tests.factories import StoragePathFactory
|
||||
from documents.tests.factories import TagFactory
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-level type aliases
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
StoragePathTuple = tuple[StoragePath, StoragePath, StoragePath]
|
||||
TagTuple = tuple[Tag, Tag, Tag, Tag, Tag]
|
||||
CorrespondentTuple = tuple[Correspondent, Correspondent]
|
||||
DocumentTypeTuple = tuple[DocumentType, DocumentType]
|
||||
DocumentTuple = tuple[Document, Document, Document, Document]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def storage_paths(db) -> StoragePathTuple:
|
||||
"""Three storage paths with varying match rules."""
|
||||
sp1 = StoragePathFactory(
|
||||
path="{created_data}/{title}",
|
||||
match="auto document",
|
||||
matching_algorithm=MatchingModel.MATCH_LITERAL,
|
||||
)
|
||||
sp2 = StoragePathFactory(
|
||||
path="{title}",
|
||||
match="^first|^unrelated",
|
||||
matching_algorithm=MatchingModel.MATCH_REGEX,
|
||||
)
|
||||
sp3 = StoragePathFactory(
|
||||
path="{title}",
|
||||
match="^blah",
|
||||
matching_algorithm=MatchingModel.MATCH_REGEX,
|
||||
)
|
||||
return sp1, sp2, sp3
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def tags(db) -> TagTuple:
|
||||
"""Tags covering the common matching scenarios."""
|
||||
tag_first = TagFactory(match="first", matching_algorithm=Tag.MATCH_ANY)
|
||||
tag_second = TagFactory(match="second", matching_algorithm=Tag.MATCH_ANY)
|
||||
tag_inbox = TagFactory(is_inbox_tag=True)
|
||||
tag_no_match = TagFactory()
|
||||
tag_auto = TagFactory(matching_algorithm=Tag.MATCH_AUTO)
|
||||
return tag_first, tag_second, tag_inbox, tag_no_match, tag_auto
|
||||
|
||||
|
||||
@pytest.fixture()
def correspondents(db) -> CorrespondentTuple:
    """Create two MATCH_ANY correspondents, keyed on "first" and "second" in that order."""
    for_first, for_second = (
        CorrespondentFactory(
            match=keyword,
            matching_algorithm=MatchingModel.MATCH_ANY,
        )
        for keyword in ("first", "second")
    )
    return for_first, for_second
|
||||
|
||||
|
||||
@pytest.fixture()
def document_types(db) -> DocumentTypeTuple:
    """Create two MATCH_ANY document types, keyed on "first" and "second" in that order."""
    for_first, for_second = (
        DocumentTypeFactory(
            match=keyword,
            matching_algorithm=MatchingModel.MATCH_ANY,
        )
        for keyword in ("first", "second")
    )
    return for_first, for_second
|
||||
|
||||
|
||||
@pytest.fixture()
def documents(storage_paths: StoragePathTuple, tags: TagTuple) -> DocumentTuple:
    """
    Create the four documents (titles "A" to "D") used by most retagger tests.

    Document "C" starts with the third storage path and with the inbox and
    no-match tags; document "D" starts with the auto tag. "A" and "B" start
    with no tags and no explicit storage path.
    """
    _literal_path, _regex_path, preassigned_path = storage_paths
    _first, _second, inbox_tag, unmatched_tag, auto_tag = tags

    doc_a = DocumentFactory(checksum="A", title="A", content="first document")
    doc_b = DocumentFactory(checksum="B", title="B", content="second document")
    doc_c = DocumentFactory(
        checksum="C",
        title="C",
        content="unrelated document",
        storage_path=preassigned_path,
    )
    doc_d = DocumentFactory(checksum="D", title="D", content="auto document")

    # Pre-existing tag assignments the tests rely on.
    doc_c.tags.add(inbox_tag, unmatched_tag)
    doc_d.tags.add(auto_tag)

    return doc_a, doc_b, doc_c, doc_d
|
||||
|
||||
|
||||
def _get_docs() -> DocumentTuple:
    """Re-read the documents titled "A", "B", "C" and "D" from the database, in that order."""
    doc_a, doc_b, doc_c, doc_d = (
        Document.objects.get(title=title) for title in ("A", "B", "C", "D")
    )
    return doc_a, doc_b, doc_c, doc_d
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tag assignment
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.management
|
||||
class TestRetagger(DirectoriesMixin, TestCase):
|
||||
def make_models(self) -> None:
|
||||
self.sp1 = StoragePath.objects.create(
|
||||
name="dummy a",
|
||||
path="{created_data}/{title}",
|
||||
match="auto document",
|
||||
matching_algorithm=StoragePath.MATCH_LITERAL,
|
||||
)
|
||||
self.sp2 = StoragePath.objects.create(
|
||||
name="dummy b",
|
||||
path="{title}",
|
||||
match="^first|^unrelated",
|
||||
matching_algorithm=StoragePath.MATCH_REGEX,
|
||||
)
|
||||
|
||||
self.sp3 = StoragePath.objects.create(
|
||||
name="dummy c",
|
||||
path="{title}",
|
||||
match="^blah",
|
||||
matching_algorithm=StoragePath.MATCH_REGEX,
|
||||
)
|
||||
|
||||
self.d1 = Document.objects.create(
|
||||
checksum="A",
|
||||
title="A",
|
||||
content="first document",
|
||||
)
|
||||
self.d2 = Document.objects.create(
|
||||
checksum="B",
|
||||
title="B",
|
||||
content="second document",
|
||||
)
|
||||
self.d3 = Document.objects.create(
|
||||
checksum="C",
|
||||
title="C",
|
||||
content="unrelated document",
|
||||
storage_path=self.sp3,
|
||||
)
|
||||
self.d4 = Document.objects.create(
|
||||
checksum="D",
|
||||
title="D",
|
||||
content="auto document",
|
||||
)
|
||||
|
||||
self.tag_first = Tag.objects.create(
|
||||
name="tag1",
|
||||
match="first",
|
||||
matching_algorithm=Tag.MATCH_ANY,
|
||||
)
|
||||
self.tag_second = Tag.objects.create(
|
||||
name="tag2",
|
||||
match="second",
|
||||
matching_algorithm=Tag.MATCH_ANY,
|
||||
)
|
||||
self.tag_inbox = Tag.objects.create(name="test", is_inbox_tag=True)
|
||||
self.tag_no_match = Tag.objects.create(name="test2")
|
||||
self.tag_auto = Tag.objects.create(
|
||||
name="tagauto",
|
||||
matching_algorithm=Tag.MATCH_AUTO,
|
||||
)
|
||||
|
||||
self.d3.tags.add(self.tag_inbox)
|
||||
self.d3.tags.add(self.tag_no_match)
|
||||
self.d4.tags.add(self.tag_auto)
|
||||
|
||||
self.correspondent_first = Correspondent.objects.create(
|
||||
name="c1",
|
||||
match="first",
|
||||
matching_algorithm=Correspondent.MATCH_ANY,
|
||||
)
|
||||
self.correspondent_second = Correspondent.objects.create(
|
||||
name="c2",
|
||||
match="second",
|
||||
matching_algorithm=Correspondent.MATCH_ANY,
|
||||
)
|
||||
|
||||
self.doctype_first = DocumentType.objects.create(
|
||||
name="dt1",
|
||||
match="first",
|
||||
matching_algorithm=DocumentType.MATCH_ANY,
|
||||
)
|
||||
self.doctype_second = DocumentType.objects.create(
|
||||
name="dt2",
|
||||
match="second",
|
||||
matching_algorithm=DocumentType.MATCH_ANY,
|
||||
)
|
||||
|
||||
def get_updated_docs(self):
|
||||
return (
|
||||
Document.objects.get(title="A"),
|
||||
Document.objects.get(title="B"),
|
||||
Document.objects.get(title="C"),
|
||||
Document.objects.get(title="D"),
|
||||
)
|
||||
|
||||
def setUp(self) -> None:
|
||||
super().setUp()
|
||||
self.make_models()
|
||||
|
||||
def test_add_tags(self) -> None:
|
||||
@pytest.mark.django_db
|
||||
class TestRetaggerTags(DirectoriesMixin):
|
||||
@pytest.mark.usefixtures("documents")
|
||||
def test_add_tags(self, tags: TagTuple) -> None:
|
||||
tag_first, tag_second, *_ = tags
|
||||
call_command("document_retagger", "--tags")
|
||||
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
|
||||
d_first, d_second, d_unrelated, d_auto = _get_docs()
|
||||
|
||||
self.assertEqual(d_first.tags.count(), 1)
|
||||
self.assertEqual(d_second.tags.count(), 1)
|
||||
self.assertEqual(d_unrelated.tags.count(), 2)
|
||||
self.assertEqual(d_auto.tags.count(), 1)
|
||||
assert d_first.tags.count() == 1
|
||||
assert d_second.tags.count() == 1
|
||||
assert d_unrelated.tags.count() == 2
|
||||
assert d_auto.tags.count() == 1
|
||||
assert d_first.tags.first() == tag_first
|
||||
assert d_second.tags.first() == tag_second
|
||||
|
||||
self.assertEqual(d_first.tags.first(), self.tag_first)
|
||||
self.assertEqual(d_second.tags.first(), self.tag_second)
|
||||
|
||||
def test_add_type(self) -> None:
|
||||
call_command("document_retagger", "--document_type")
|
||||
d_first, d_second, _, _ = self.get_updated_docs()
|
||||
|
||||
self.assertEqual(d_first.document_type, self.doctype_first)
|
||||
self.assertEqual(d_second.document_type, self.doctype_second)
|
||||
|
||||
def test_add_correspondent(self) -> None:
|
||||
call_command("document_retagger", "--correspondent")
|
||||
d_first, d_second, _, _ = self.get_updated_docs()
|
||||
|
||||
self.assertEqual(d_first.correspondent, self.correspondent_first)
|
||||
self.assertEqual(d_second.correspondent, self.correspondent_second)
|
||||
|
||||
def test_overwrite_preserve_inbox(self) -> None:
|
||||
self.d1.tags.add(self.tag_second)
|
||||
def test_overwrite_removes_stale_tags_and_preserves_inbox(
|
||||
self,
|
||||
documents: DocumentTuple,
|
||||
tags: TagTuple,
|
||||
) -> None:
|
||||
d1, *_ = documents
|
||||
tag_first, tag_second, tag_inbox, tag_no_match, _ = tags
|
||||
d1.tags.add(tag_second)
|
||||
|
||||
call_command("document_retagger", "--tags", "--overwrite")
|
||||
|
||||
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
|
||||
d_first, d_second, d_unrelated, d_auto = _get_docs()
|
||||
|
||||
self.assertIsNotNone(Tag.objects.get(id=self.tag_second.id))
|
||||
assert Tag.objects.filter(id=tag_second.id).exists()
|
||||
assert list(d_first.tags.values_list("id", flat=True)) == [tag_first.id]
|
||||
assert list(d_second.tags.values_list("id", flat=True)) == [tag_second.id]
|
||||
assert set(d_unrelated.tags.values_list("id", flat=True)) == {
|
||||
tag_inbox.id,
|
||||
tag_no_match.id,
|
||||
}
|
||||
assert d_auto.tags.count() == 0
|
||||
|
||||
self.assertCountEqual(
|
||||
[tag.id for tag in d_first.tags.all()],
|
||||
[self.tag_first.id],
|
||||
@pytest.mark.usefixtures("documents")
@pytest.mark.parametrize(
    "extra_args",
    [
        pytest.param([], id="no_base_url"),
        pytest.param(["--base-url=http://localhost"], id="with_base_url"),
    ],
)
def test_suggest_does_not_apply_tags(self, extra_args: list[str]) -> None:
    """
    GIVEN the fixture documents
    WHEN document_retagger --tags --suggest is called, with or without a base URL
    THEN the tag counts of documents A, B and D are left as the fixture set them
    """
    command_args = ["--tags", "--suggest", *extra_args]
    call_command("document_retagger", *command_args)

    doc_a, doc_b, _, doc_d = _get_docs()

    # A and B start untagged; D starts with exactly one tag.
    assert doc_a.tags.count() == 0
    assert doc_b.tags.count() == 0
    assert doc_d.tags.count() == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Document type assignment
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.management
|
||||
@pytest.mark.django_db
|
||||
class TestRetaggerDocumentType(DirectoriesMixin):
|
||||
@pytest.mark.usefixtures("documents")
def test_add_type(self, document_types: DocumentTypeTuple) -> None:
    """
    GIVEN the fixture documents and two document types matching "first" / "second"
    WHEN document_retagger --document_type is called
    THEN documents A and B receive the respective document type
    """
    expected_for_a, expected_for_b = document_types

    call_command("document_retagger", "--document_type")

    doc_a, doc_b, _, _ = _get_docs()
    assert doc_a.document_type == expected_for_a
    assert doc_b.document_type == expected_for_b
|
||||
|
||||
@pytest.mark.usefixtures("documents", "document_types")
@pytest.mark.parametrize(
    "extra_args",
    [
        pytest.param([], id="no_base_url"),
        pytest.param(["--base-url=http://localhost"], id="with_base_url"),
    ],
)
def test_suggest_does_not_apply_document_type(self, extra_args: list[str]) -> None:
    """
    GIVEN the fixture documents and matching document types
    WHEN document_retagger --document_type --suggest is called, with or without a base URL
    THEN documents A and B still have no document type
    """
    command_args = ["--document_type", "--suggest", *extra_args]
    call_command("document_retagger", *command_args)

    doc_a, doc_b, _, _ = _get_docs()
    assert doc_a.document_type is None
    assert doc_b.document_type is None
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("use_first_flag", "expects_assignment"),
|
||||
[
|
||||
pytest.param(["--use-first"], True, id="use_first_assigns_first_match"),
|
||||
pytest.param([], False, id="no_use_first_skips_ambiguous_match"),
|
||||
],
|
||||
)
|
||||
def test_use_first_with_multiple_matches(
|
||||
self,
|
||||
use_first_flag: list[str],
|
||||
*,
|
||||
expects_assignment: bool,
|
||||
) -> None:
|
||||
DocumentTypeFactory(
|
||||
match="ambiguous",
|
||||
matching_algorithm=MatchingModel.MATCH_ANY,
|
||||
)
|
||||
self.assertCountEqual(
|
||||
[tag.id for tag in d_second.tags.all()],
|
||||
[self.tag_second.id],
|
||||
DocumentTypeFactory(
|
||||
match="ambiguous",
|
||||
matching_algorithm=MatchingModel.MATCH_ANY,
|
||||
)
|
||||
self.assertCountEqual(
|
||||
[tag.id for tag in d_unrelated.tags.all()],
|
||||
[self.tag_inbox.id, self.tag_no_match.id],
|
||||
doc = DocumentFactory(content="ambiguous content")
|
||||
|
||||
call_command("document_retagger", "--document_type", *use_first_flag)
|
||||
|
||||
doc.refresh_from_db()
|
||||
assert (doc.document_type is not None) is expects_assignment
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Correspondent assignment
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.management
|
||||
@pytest.mark.django_db
|
||||
class TestRetaggerCorrespondent(DirectoriesMixin):
|
||||
@pytest.mark.usefixtures("documents")
def test_add_correspondent(self, correspondents: CorrespondentTuple) -> None:
    """
    GIVEN the fixture documents and two correspondents matching "first" / "second"
    WHEN document_retagger --correspondent is called
    THEN documents A and B receive the respective correspondent
    """
    expected_for_a, expected_for_b = correspondents

    call_command("document_retagger", "--correspondent")

    doc_a, doc_b, _, _ = _get_docs()
    assert doc_a.correspondent == expected_for_a
    assert doc_b.correspondent == expected_for_b
|
||||
|
||||
@pytest.mark.usefixtures("documents", "correspondents")
@pytest.mark.parametrize(
    "extra_args",
    [
        pytest.param([], id="no_base_url"),
        pytest.param(["--base-url=http://localhost"], id="with_base_url"),
    ],
)
def test_suggest_does_not_apply_correspondent(self, extra_args: list[str]) -> None:
    """
    GIVEN the fixture documents and matching correspondents
    WHEN document_retagger --correspondent --suggest is called, with or without a base URL
    THEN documents A and B still have no correspondent
    """
    command_args = ["--correspondent", "--suggest", *extra_args]
    call_command("document_retagger", *command_args)

    doc_a, doc_b, _, _ = _get_docs()
    assert doc_a.correspondent is None
    assert doc_b.correspondent is None
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("use_first_flag", "expects_assignment"),
|
||||
[
|
||||
pytest.param(["--use-first"], True, id="use_first_assigns_first_match"),
|
||||
pytest.param([], False, id="no_use_first_skips_ambiguous_match"),
|
||||
],
|
||||
)
|
||||
def test_use_first_with_multiple_matches(
|
||||
self,
|
||||
use_first_flag: list[str],
|
||||
*,
|
||||
expects_assignment: bool,
|
||||
) -> None:
|
||||
CorrespondentFactory(
|
||||
match="ambiguous",
|
||||
matching_algorithm=MatchingModel.MATCH_ANY,
|
||||
)
|
||||
self.assertEqual(d_auto.tags.count(), 0)
|
||||
|
||||
def test_add_tags_suggest(self) -> None:
|
||||
call_command("document_retagger", "--tags", "--suggest")
|
||||
d_first, d_second, _, d_auto = self.get_updated_docs()
|
||||
|
||||
self.assertEqual(d_first.tags.count(), 0)
|
||||
self.assertEqual(d_second.tags.count(), 0)
|
||||
self.assertEqual(d_auto.tags.count(), 1)
|
||||
|
||||
def test_add_type_suggest(self) -> None:
|
||||
call_command("document_retagger", "--document_type", "--suggest")
|
||||
d_first, d_second, _, _ = self.get_updated_docs()
|
||||
|
||||
self.assertIsNone(d_first.document_type)
|
||||
self.assertIsNone(d_second.document_type)
|
||||
|
||||
def test_add_correspondent_suggest(self) -> None:
|
||||
call_command("document_retagger", "--correspondent", "--suggest")
|
||||
d_first, d_second, _, _ = self.get_updated_docs()
|
||||
|
||||
self.assertIsNone(d_first.correspondent)
|
||||
self.assertIsNone(d_second.correspondent)
|
||||
|
||||
def test_add_tags_suggest_url(self) -> None:
|
||||
call_command(
|
||||
"document_retagger",
|
||||
"--tags",
|
||||
"--suggest",
|
||||
"--base-url=http://localhost",
|
||||
CorrespondentFactory(
|
||||
match="ambiguous",
|
||||
matching_algorithm=MatchingModel.MATCH_ANY,
|
||||
)
|
||||
d_first, d_second, _, d_auto = self.get_updated_docs()
|
||||
doc = DocumentFactory(content="ambiguous content")
|
||||
|
||||
self.assertEqual(d_first.tags.count(), 0)
|
||||
self.assertEqual(d_second.tags.count(), 0)
|
||||
self.assertEqual(d_auto.tags.count(), 1)
|
||||
call_command("document_retagger", "--correspondent", *use_first_flag)
|
||||
|
||||
def test_add_type_suggest_url(self) -> None:
|
||||
call_command(
|
||||
"document_retagger",
|
||||
"--document_type",
|
||||
"--suggest",
|
||||
"--base-url=http://localhost",
|
||||
)
|
||||
d_first, d_second, _, _ = self.get_updated_docs()
|
||||
doc.refresh_from_db()
|
||||
assert (doc.correspondent is not None) is expects_assignment
|
||||
|
||||
self.assertIsNone(d_first.document_type)
|
||||
self.assertIsNone(d_second.document_type)
|
||||
|
||||
def test_add_correspondent_suggest_url(self) -> None:
|
||||
call_command(
|
||||
"document_retagger",
|
||||
"--correspondent",
|
||||
"--suggest",
|
||||
"--base-url=http://localhost",
|
||||
)
|
||||
d_first, d_second, _, _ = self.get_updated_docs()
|
||||
# ---------------------------------------------------------------------------
|
||||
# Storage path assignment
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
self.assertIsNone(d_first.correspondent)
|
||||
self.assertIsNone(d_second.correspondent)
|
||||
|
||||
def test_add_storage_path(self) -> None:
|
||||
@pytest.mark.management
|
||||
@pytest.mark.django_db
|
||||
class TestRetaggerStoragePath(DirectoriesMixin):
|
||||
@pytest.mark.usefixtures("documents")
|
||||
def test_add_storage_path(self, storage_paths: StoragePathTuple) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- 2 storage paths with documents which match them
|
||||
- 1 document which matches but has a storage path
|
||||
WHEN:
|
||||
- document retagger is called
|
||||
THEN:
|
||||
- Matching document's storage paths updated
|
||||
- Non-matching documents have no storage path
|
||||
- Existing storage patch left unchanged
|
||||
GIVEN documents matching various storage path rules
|
||||
WHEN document_retagger --storage_path is called
|
||||
THEN matching documents get the correct path; existing path is unchanged
|
||||
"""
|
||||
call_command(
|
||||
"document_retagger",
|
||||
"--storage_path",
|
||||
)
|
||||
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
|
||||
sp1, sp2, sp3 = storage_paths
|
||||
call_command("document_retagger", "--storage_path")
|
||||
d_first, d_second, d_unrelated, d_auto = _get_docs()
|
||||
|
||||
self.assertEqual(d_first.storage_path, self.sp2)
|
||||
self.assertEqual(d_auto.storage_path, self.sp1)
|
||||
self.assertIsNone(d_second.storage_path)
|
||||
self.assertEqual(d_unrelated.storage_path, self.sp3)
|
||||
assert d_first.storage_path == sp2
|
||||
assert d_auto.storage_path == sp1
|
||||
assert d_second.storage_path is None
|
||||
assert d_unrelated.storage_path == sp3
|
||||
|
||||
def test_overwrite_storage_path(self) -> None:
|
||||
@pytest.mark.usefixtures("documents")
|
||||
def test_overwrite_storage_path(self, storage_paths: StoragePathTuple) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- 2 storage paths with documents which match them
|
||||
- 1 document which matches but has a storage path
|
||||
WHEN:
|
||||
- document retagger is called with overwrite
|
||||
THEN:
|
||||
- Matching document's storage paths updated
|
||||
- Non-matching documents have no storage path
|
||||
- Existing storage patch overwritten
|
||||
GIVEN a document with an existing storage path that matches a different rule
|
||||
WHEN document_retagger --storage_path --overwrite is called
|
||||
THEN the existing path is replaced by the newly matched path
|
||||
"""
|
||||
sp1, sp2, _ = storage_paths
|
||||
call_command("document_retagger", "--storage_path", "--overwrite")
|
||||
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
|
||||
d_first, d_second, d_unrelated, d_auto = _get_docs()
|
||||
|
||||
self.assertEqual(d_first.storage_path, self.sp2)
|
||||
self.assertEqual(d_auto.storage_path, self.sp1)
|
||||
self.assertIsNone(d_second.storage_path)
|
||||
self.assertEqual(d_unrelated.storage_path, self.sp2)
|
||||
assert d_first.storage_path == sp2
|
||||
assert d_auto.storage_path == sp1
|
||||
assert d_second.storage_path is None
|
||||
assert d_unrelated.storage_path == sp2
|
||||
|
||||
def test_id_range_parameter(self) -> None:
|
||||
commandOutput = ""
|
||||
Document.objects.create(
|
||||
checksum="E",
|
||||
title="E",
|
||||
content="NOT the first document",
|
||||
@pytest.mark.parametrize(
|
||||
("use_first_flag", "expects_assignment"),
|
||||
[
|
||||
pytest.param(["--use-first"], True, id="use_first_assigns_first_match"),
|
||||
pytest.param([], False, id="no_use_first_skips_ambiguous_match"),
|
||||
],
|
||||
)
|
||||
def test_use_first_with_multiple_matches(
|
||||
self,
|
||||
use_first_flag: list[str],
|
||||
*,
|
||||
expects_assignment: bool,
|
||||
) -> None:
|
||||
StoragePathFactory(
|
||||
match="ambiguous",
|
||||
matching_algorithm=MatchingModel.MATCH_ANY,
|
||||
)
|
||||
call_command("document_retagger", "--tags", "--id-range", "1", "2")
|
||||
# The retagger shouldn't apply the 'first' tag to our new document
|
||||
self.assertEqual(Document.objects.filter(tags__id=self.tag_first.id).count(), 1)
|
||||
StoragePathFactory(
|
||||
match="ambiguous",
|
||||
matching_algorithm=MatchingModel.MATCH_ANY,
|
||||
)
|
||||
doc = DocumentFactory(content="ambiguous content")
|
||||
|
||||
try:
|
||||
commandOutput = call_command("document_retagger", "--tags", "--id-range")
|
||||
except CommandError:
|
||||
# Just ignore the error
|
||||
None
|
||||
self.assertIn(commandOutput, "Error: argument --id-range: expected 2 arguments")
|
||||
call_command("document_retagger", "--storage_path", *use_first_flag)
|
||||
|
||||
try:
|
||||
commandOutput = call_command(
|
||||
"document_retagger",
|
||||
"--tags",
|
||||
"--id-range",
|
||||
"a",
|
||||
"b",
|
||||
)
|
||||
except CommandError:
|
||||
# Just ignore the error
|
||||
None
|
||||
self.assertIn(commandOutput, "error: argument --id-range: invalid int value:")
|
||||
doc.refresh_from_db()
|
||||
assert (doc.storage_path is not None) is expects_assignment
|
||||
|
||||
call_command("document_retagger", "--tags", "--id-range", "1", "9999")
|
||||
# Now we should have 2 documents
|
||||
self.assertEqual(Document.objects.filter(tags__id=self.tag_first.id).count(), 2)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ID range filtering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.management
@pytest.mark.django_db
class TestRetaggerIdRange(DirectoriesMixin):
    """Checks how ``document_retagger --id-range`` handles valid and invalid bounds."""

    @pytest.mark.parametrize(
        ("range_covers_extra_doc", "expected_count"),
        [
            pytest.param(False, 1, id="narrow_range_limits_scope"),
            pytest.param(True, 2, id="wide_range_tags_all_matches"),
        ],
    )
    def test_id_range_limits_scope(
        self,
        documents: DocumentTuple,
        tags: TagTuple,
        *,
        range_covers_extra_doc: bool,
        expected_count: int,
    ) -> None:
        """
        GIVEN the four fixture documents plus one extra document whose content
              also contains the word "first"
        WHEN document_retagger --tags --id-range is called
        THEN only documents whose id falls inside the range receive the tag
        """
        doc_a, doc_b, *_ = documents
        extra_doc = DocumentFactory(content="NOT the first document")
        tag_first, *_ = tags

        # Build the bounds from the real primary keys instead of assuming the
        # first document has id 1: the django_db marker does not reset
        # sequences between tests, so ids may start higher on some backends.
        lower_bound = doc_a.pk
        if range_covers_extra_doc:
            # One past the newest document, so it is covered whether the
            # command treats the upper bound as inclusive or exclusive.
            upper_bound = extra_doc.pk + 1
        else:
            upper_bound = doc_b.pk

        call_command(
            "document_retagger",
            "--tags",
            "--id-range",
            str(lower_bound),
            str(upper_bound),
        )

        assert Document.objects.filter(tags__id=tag_first.id).count() == expected_count

    @pytest.mark.usefixtures("documents")
    @pytest.mark.parametrize(
        "args",
        [
            pytest.param(["--tags", "--id-range"], id="missing_both_values"),
            pytest.param(["--tags", "--id-range", "a", "b"], id="non_integer_values"),
        ],
    )
    def test_id_range_invalid_arguments_raise(self, args: list[str]) -> None:
        """Missing or non-integer --id-range values must raise CommandError or SystemExit."""
        with pytest.raises((CommandError, SystemExit)):
            call_command("document_retagger", *args)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Edge cases
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.management
@pytest.mark.django_db
class TestRetaggerEdgeCases(DirectoriesMixin):
    """Retagger invocations outside the per-classifier happy paths."""

    @pytest.mark.usefixtures("documents")
    def test_no_targets_exits_cleanly(self) -> None:
        """Running the command without any classifier flag must not raise."""
        call_command("document_retagger")

    @pytest.mark.usefixtures("documents")
    def test_inbox_only_skips_non_inbox_documents(self) -> None:
        """With --inbox-only, a document without an inbox tag stays untagged."""
        call_command("document_retagger", "--tags", "--inbox-only")

        doc_a, _, doc_c, _ = _get_docs()

        # A has no inbox tag and must remain untagged; C keeps its two tags.
        assert doc_a.tags.count() == 0
        assert doc_c.tags.count() == 2
|
||||
|
||||
@@ -1,295 +1,192 @@
|
||||
"""Tests for the sanity checker module.
|
||||
|
||||
Tests exercise ``check_sanity`` as a whole, verifying document validation,
|
||||
orphan detection, task recording, and the iter_wrapper contract.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
import filelock
|
||||
from django.conf import settings
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.models import Document
|
||||
from documents.models import PaperlessTask
|
||||
from documents.sanity_checker import check_sanity
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable
|
||||
|
||||
from documents.tests.conftest import PaperlessDirs
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestCheckSanityNoDocuments:
|
||||
"""Sanity checks against an empty archive."""
|
||||
class TestSanityCheck(DirectoriesMixin, TestCase):
|
||||
def make_test_data(self):
|
||||
with filelock.FileLock(settings.MEDIA_LOCK):
|
||||
# just make sure that the lockfile is present.
|
||||
shutil.copy(
|
||||
(
|
||||
Path(__file__).parent
|
||||
/ "samples"
|
||||
/ "documents"
|
||||
/ "originals"
|
||||
/ "0000001.pdf"
|
||||
),
|
||||
Path(self.dirs.originals_dir) / "0000001.pdf",
|
||||
)
|
||||
shutil.copy(
|
||||
(
|
||||
Path(__file__).parent
|
||||
/ "samples"
|
||||
/ "documents"
|
||||
/ "archive"
|
||||
/ "0000001.pdf"
|
||||
),
|
||||
Path(self.dirs.archive_dir) / "0000001.pdf",
|
||||
)
|
||||
shutil.copy(
|
||||
(
|
||||
Path(__file__).parent
|
||||
/ "samples"
|
||||
/ "documents"
|
||||
/ "thumbnails"
|
||||
/ "0000001.webp"
|
||||
),
|
||||
Path(self.dirs.thumbnail_dir) / "0000001.webp",
|
||||
)
|
||||
|
||||
@pytest.mark.usefixtures("_media_settings")
|
||||
def test_no_documents(self) -> None:
|
||||
return Document.objects.create(
|
||||
title="test",
|
||||
checksum="42995833e01aea9b3edee44bbfdd7ce1",
|
||||
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
|
||||
content="test",
|
||||
pk=1,
|
||||
filename="0000001.pdf",
|
||||
mime_type="application/pdf",
|
||||
archive_filename="0000001.pdf",
|
||||
)
|
||||
|
||||
def assertSanityError(self, doc: Document, messageRegex) -> None:
|
||||
messages = check_sanity()
|
||||
assert not messages.has_error
|
||||
assert not messages.has_warning
|
||||
assert messages.total_issue_count == 0
|
||||
|
||||
@pytest.mark.usefixtures("_media_settings")
|
||||
def test_no_issues_logs_clean(self, caplog: pytest.LogCaptureFixture) -> None:
|
||||
messages = check_sanity()
|
||||
with caplog.at_level(logging.INFO, logger="paperless.sanity_checker"):
|
||||
self.assertTrue(messages.has_error)
|
||||
with self.assertLogs() as capture:
|
||||
messages.log_messages()
|
||||
assert "Sanity checker detected no issues." in caplog.text
|
||||
|
||||
|
||||
@pytest.mark.django_db
class TestCheckSanityHealthyDocument:
    """Sanity check run against the unmodified ``sample_doc`` fixture."""

    def test_no_errors(self, sample_doc: Document) -> None:
        """check_sanity() reports no errors, no warnings and an issue count of zero."""
        report = check_sanity()

        assert not report.has_error
        assert not report.has_warning
        assert report.total_issue_count == 0
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestCheckSanityThumbnail:
|
||||
def test_missing(self, sample_doc: Document) -> None:
    """Deleting the thumbnail file must be reported as an error for the document."""
    thumbnail_file = Path(sample_doc.thumbnail_path)
    thumbnail_file.unlink()

    report = check_sanity()

    assert report.has_error
    assert any(
        "Thumbnail of document does not exist" in issue["message"]
        for issue in report[sample_doc.pk]
    )
|
||||
|
||||
def test_unreadable(self, sample_doc: Document) -> None:
|
||||
thumb = Path(sample_doc.thumbnail_path)
|
||||
thumb.chmod(0o000)
|
||||
try:
|
||||
messages = check_sanity()
|
||||
assert messages.has_error
|
||||
assert any(
|
||||
"Cannot read thumbnail" in m["message"] for m in messages[sample_doc.pk]
|
||||
self.assertEqual(
|
||||
capture.records[0].message,
|
||||
f"Detected following issue(s) with document #{doc.pk}, titled {doc.title}",
|
||||
)
|
||||
finally:
|
||||
thumb.chmod(0o644)
|
||||
self.assertRegex(capture.records[1].message, messageRegex)
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestCheckSanityOriginal:
|
||||
def test_missing(self, sample_doc: Document) -> None:
|
||||
Path(sample_doc.source_path).unlink()
|
||||
def test_no_issues(self) -> None:
|
||||
self.make_test_data()
|
||||
messages = check_sanity()
|
||||
assert messages.has_error
|
||||
assert any(
|
||||
"Original of document does not exist" in m["message"]
|
||||
for m in messages[sample_doc.pk]
|
||||
)
|
||||
|
||||
def test_checksum_mismatch(self, sample_doc: Document) -> None:
    """A stored checksum that differs from the file's must be reported, quoting the stored value."""
    stored_checksum = "badhash"
    sample_doc.checksum = stored_checksum
    sample_doc.save()

    report = check_sanity()

    assert report.has_error
    assert any(
        "Checksum mismatch" in issue["message"] and "badhash" in issue["message"]
        for issue in report[sample_doc.pk]
    )
|
||||
|
||||
def test_unreadable(self, sample_doc: Document) -> None:
|
||||
src = Path(sample_doc.source_path)
|
||||
src.chmod(0o000)
|
||||
try:
|
||||
messages = check_sanity()
|
||||
assert messages.has_error
|
||||
assert any(
|
||||
"Cannot read original" in m["message"] for m in messages[sample_doc.pk]
|
||||
self.assertFalse(messages.has_error)
|
||||
self.assertFalse(messages.has_warning)
|
||||
with self.assertLogs() as capture:
|
||||
messages.log_messages()
|
||||
self.assertEqual(len(capture.output), 1)
|
||||
self.assertEqual(capture.records[0].levelno, logging.INFO)
|
||||
self.assertEqual(
|
||||
capture.records[0].message,
|
||||
"Sanity checker detected no issues.",
|
||||
)
|
||||
finally:
|
||||
src.chmod(0o644)
|
||||
|
||||
def test_no_docs(self) -> None:
|
||||
self.assertEqual(len(check_sanity()), 0)
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestCheckSanityArchive:
|
||||
def test_checksum_without_filename(self, sample_doc: Document) -> None:
|
||||
sample_doc.archive_filename = None
|
||||
sample_doc.save()
|
||||
def test_success(self) -> None:
|
||||
self.make_test_data()
|
||||
self.assertEqual(len(check_sanity()), 0)
|
||||
|
||||
def test_no_thumbnail(self) -> None:
|
||||
doc = self.make_test_data()
|
||||
Path(doc.thumbnail_path).unlink()
|
||||
self.assertSanityError(doc, "Thumbnail of document does not exist")
|
||||
|
||||
def test_thumbnail_no_access(self) -> None:
|
||||
doc = self.make_test_data()
|
||||
Path(doc.thumbnail_path).chmod(0o000)
|
||||
self.assertSanityError(doc, "Cannot read thumbnail file of document")
|
||||
Path(doc.thumbnail_path).chmod(0o777)
|
||||
|
||||
def test_no_original(self) -> None:
|
||||
doc = self.make_test_data()
|
||||
Path(doc.source_path).unlink()
|
||||
self.assertSanityError(doc, "Original of document does not exist.")
|
||||
|
||||
def test_original_no_access(self) -> None:
|
||||
doc = self.make_test_data()
|
||||
Path(doc.source_path).chmod(0o000)
|
||||
self.assertSanityError(doc, "Cannot read original file of document")
|
||||
Path(doc.source_path).chmod(0o777)
|
||||
|
||||
def test_original_checksum_mismatch(self) -> None:
|
||||
doc = self.make_test_data()
|
||||
doc.checksum = "WOW"
|
||||
doc.save()
|
||||
self.assertSanityError(doc, "Checksum mismatch. Stored: WOW, actual: ")
|
||||
|
||||
def test_no_archive(self) -> None:
|
||||
doc = self.make_test_data()
|
||||
Path(doc.archive_path).unlink()
|
||||
self.assertSanityError(doc, "Archived version of document does not exist.")
|
||||
|
||||
def test_archive_no_access(self) -> None:
|
||||
doc = self.make_test_data()
|
||||
Path(doc.archive_path).chmod(0o000)
|
||||
self.assertSanityError(doc, "Cannot read archive file of document")
|
||||
Path(doc.archive_path).chmod(0o777)
|
||||
|
||||
def test_archive_checksum_mismatch(self) -> None:
|
||||
doc = self.make_test_data()
|
||||
doc.archive_checksum = "WOW"
|
||||
doc.save()
|
||||
self.assertSanityError(doc, "Checksum mismatch of archived document")
|
||||
|
||||
def test_empty_content(self) -> None:
|
||||
doc = self.make_test_data()
|
||||
doc.content = ""
|
||||
doc.save()
|
||||
messages = check_sanity()
|
||||
assert messages.has_error
|
||||
assert any(
|
||||
"checksum, but no archive filename" in m["message"]
|
||||
for m in messages[sample_doc.pk]
|
||||
self.assertFalse(messages.has_error)
|
||||
self.assertFalse(messages.has_warning)
|
||||
self.assertEqual(len(messages), 1)
|
||||
self.assertRegex(
|
||||
messages[doc.pk][0]["message"],
|
||||
"Document contains no OCR data",
|
||||
)
|
||||
|
||||
def test_filename_without_checksum(self, sample_doc: Document) -> None:
|
||||
sample_doc.archive_checksum = None
|
||||
sample_doc.save()
|
||||
def test_orphaned_file(self) -> None:
|
||||
self.make_test_data()
|
||||
Path(self.dirs.originals_dir, "orphaned").touch()
|
||||
messages = check_sanity()
|
||||
assert messages.has_error
|
||||
assert any(
|
||||
"checksum is missing" in m["message"] for m in messages[sample_doc.pk]
|
||||
self.assertTrue(messages.has_warning)
|
||||
self.assertRegex(
|
||||
messages._messages[None][0]["message"],
|
||||
"Orphaned file in media dir",
|
||||
)
|
||||
|
||||
def test_missing_file(self, sample_doc: Document) -> None:
|
||||
Path(sample_doc.archive_path).unlink()
|
||||
messages = check_sanity()
|
||||
assert messages.has_error
|
||||
assert any(
|
||||
"Archived version of document does not exist" in m["message"]
|
||||
for m in messages[sample_doc.pk]
|
||||
)
|
||||
|
||||
def test_checksum_mismatch(self, sample_doc: Document) -> None:
|
||||
sample_doc.archive_checksum = "wronghash"
|
||||
sample_doc.save()
|
||||
messages = check_sanity()
|
||||
assert messages.has_error
|
||||
assert any(
|
||||
"Checksum mismatch of archived document" in m["message"]
|
||||
for m in messages[sample_doc.pk]
|
||||
)
|
||||
|
||||
def test_unreadable(self, sample_doc: Document) -> None:
|
||||
archive = Path(sample_doc.archive_path)
|
||||
archive.chmod(0o000)
|
||||
try:
|
||||
messages = check_sanity()
|
||||
assert messages.has_error
|
||||
assert any(
|
||||
"Cannot read archive" in m["message"] for m in messages[sample_doc.pk]
|
||||
)
|
||||
finally:
|
||||
archive.chmod(0o644)
|
||||
|
||||
def test_no_archive_at_all(self, sample_doc: Document) -> None:
|
||||
"""Document with neither archive checksum nor filename is valid."""
|
||||
Path(sample_doc.archive_path).unlink()
|
||||
sample_doc.archive_checksum = None
|
||||
sample_doc.archive_filename = None
|
||||
sample_doc.save()
|
||||
messages = check_sanity()
|
||||
assert not messages.has_error
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestCheckSanityContent:
|
||||
@pytest.mark.parametrize(
|
||||
"content",
|
||||
[
|
||||
pytest.param("", id="empty-string"),
|
||||
],
|
||||
@override_settings(
|
||||
APP_LOGO="logo/logo.png",
|
||||
)
|
||||
def test_no_content(self, sample_doc: Document, content: str) -> None:
|
||||
sample_doc.content = content
|
||||
sample_doc.save()
|
||||
def test_ignore_logo(self) -> None:
|
||||
self.make_test_data()
|
||||
logo_dir = Path(self.dirs.media_dir, "logo")
|
||||
logo_dir.mkdir(parents=True, exist_ok=True)
|
||||
Path(self.dirs.media_dir, "logo", "logo.png").touch()
|
||||
messages = check_sanity()
|
||||
assert not messages.has_error
|
||||
assert not messages.has_warning
|
||||
assert any("no OCR data" in m["message"] for m in messages[sample_doc.pk])
|
||||
self.assertFalse(messages.has_warning)
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestCheckSanityOrphans:
|
||||
def test_orphaned_file(
|
||||
self,
|
||||
sample_doc: Document,
|
||||
paperless_dirs: PaperlessDirs,
|
||||
) -> None:
|
||||
(paperless_dirs.originals / "orphan.pdf").touch()
|
||||
def test_ignore_ignorable_files(self) -> None:
|
||||
self.make_test_data()
|
||||
Path(self.dirs.media_dir, ".DS_Store").touch()
|
||||
Path(self.dirs.media_dir, "desktop.ini").touch()
|
||||
messages = check_sanity()
|
||||
assert messages.has_warning
|
||||
assert any("Orphaned file" in m["message"] for m in messages[None])
|
||||
self.assertFalse(messages.has_warning)
|
||||
|
||||
@pytest.mark.usefixtures("_media_settings")
|
||||
def test_ignorable_files_not_flagged(
|
||||
self,
|
||||
paperless_dirs: PaperlessDirs,
|
||||
) -> None:
|
||||
(paperless_dirs.media / ".DS_Store").touch()
|
||||
(paperless_dirs.media / "desktop.ini").touch()
|
||||
messages = check_sanity()
|
||||
assert not messages.has_warning
|
||||
def test_archive_filename_no_checksum(self) -> None:
|
||||
doc = self.make_test_data()
|
||||
doc.archive_checksum = None
|
||||
doc.save()
|
||||
self.assertSanityError(doc, "has an archive file, but its checksum is missing.")
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestCheckSanityIterWrapper:
|
||||
def test_wrapper_receives_documents(self, sample_doc: Document) -> None:
|
||||
seen: list[Document] = []
|
||||
|
||||
def tracking(iterable: Iterable[Document]) -> Iterable[Document]:
|
||||
for item in iterable:
|
||||
seen.append(item)
|
||||
yield item
|
||||
|
||||
check_sanity(iter_wrapper=tracking)
|
||||
assert len(seen) == 1
|
||||
assert seen[0].pk == sample_doc.pk
|
||||
|
||||
def test_default_works_without_wrapper(self, sample_doc: Document) -> None:
|
||||
messages = check_sanity()
|
||||
assert not messages.has_error
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestCheckSanityTaskRecording:
|
||||
@pytest.mark.parametrize(
|
||||
("expected_type", "scheduled"),
|
||||
[
|
||||
pytest.param(PaperlessTask.TaskType.SCHEDULED_TASK, True, id="scheduled"),
|
||||
pytest.param(PaperlessTask.TaskType.MANUAL_TASK, False, id="manual"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.usefixtures("_media_settings")
|
||||
def test_task_type(self, expected_type: str, *, scheduled: bool) -> None:
|
||||
check_sanity(scheduled=scheduled)
|
||||
task = PaperlessTask.objects.latest("date_created")
|
||||
assert task.task_name == PaperlessTask.TaskName.CHECK_SANITY
|
||||
assert task.type == expected_type
|
||||
|
||||
def test_success_status(self, sample_doc: Document) -> None:
|
||||
check_sanity()
|
||||
task = PaperlessTask.objects.latest("date_created")
|
||||
assert task.status == "SUCCESS"
|
||||
|
||||
def test_failure_status(self, sample_doc: Document) -> None:
|
||||
Path(sample_doc.source_path).unlink()
|
||||
check_sanity()
|
||||
task = PaperlessTask.objects.latest("date_created")
|
||||
assert task.status == "FAILURE"
|
||||
assert "Check logs for details" in task.result
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestCheckSanityLogMessages:
|
||||
def test_logs_doc_issues(
|
||||
self,
|
||||
sample_doc: Document,
|
||||
caplog: pytest.LogCaptureFixture,
|
||||
) -> None:
|
||||
Path(sample_doc.source_path).unlink()
|
||||
messages = check_sanity()
|
||||
with caplog.at_level(logging.INFO, logger="paperless.sanity_checker"):
|
||||
messages.log_messages()
|
||||
assert f"document #{sample_doc.pk}" in caplog.text
|
||||
assert "Original of document does not exist" in caplog.text
|
||||
|
||||
def test_logs_global_issues(
|
||||
self,
|
||||
sample_doc: Document,
|
||||
paperless_dirs: PaperlessDirs,
|
||||
caplog: pytest.LogCaptureFixture,
|
||||
) -> None:
|
||||
(paperless_dirs.originals / "orphan.pdf").touch()
|
||||
messages = check_sanity()
|
||||
with caplog.at_level(logging.WARNING, logger="paperless.sanity_checker"):
|
||||
messages.log_messages()
|
||||
assert "Orphaned file" in caplog.text
|
||||
|
||||
@pytest.mark.usefixtures("_media_settings")
|
||||
def test_logs_unknown_doc_pk(self, caplog: pytest.LogCaptureFixture) -> None:
|
||||
"""A doc PK not in the DB logs 'Unknown' as the title."""
|
||||
messages = check_sanity()
|
||||
messages.error(99999, "Ghost document")
|
||||
with caplog.at_level(logging.INFO, logger="paperless.sanity_checker"):
|
||||
messages.log_messages()
|
||||
assert "#99999" in caplog.text
|
||||
assert "Unknown" in caplog.text
|
||||
def test_archive_checksum_no_filename(self) -> None:
|
||||
doc = self.make_test_data()
|
||||
doc.archive_filename = None
|
||||
doc.save()
|
||||
self.assertSanityError(
|
||||
doc,
|
||||
"has an archive file checksum, but no archive filename.",
|
||||
)
|
||||
|
||||
@@ -3,7 +3,6 @@ from datetime import timedelta
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from celery import states
|
||||
from django.conf import settings
|
||||
from django.test import TestCase
|
||||
@@ -106,83 +105,55 @@ class TestClassifier(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
self.assertNotEqual(mtime2, mtime3)
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestSanityCheck:
|
||||
@pytest.fixture
|
||||
def mock_check_sanity(self, mocker) -> mock.MagicMock:
|
||||
return mocker.patch("documents.tasks.sanity_checker.check_sanity")
|
||||
class TestSanityCheck(DirectoriesMixin, TestCase):
|
||||
@mock.patch("documents.tasks.sanity_checker.check_sanity")
|
||||
def test_sanity_check_success(self, m) -> None:
|
||||
m.return_value = SanityCheckMessages()
|
||||
self.assertEqual(tasks.sanity_check(), "No issues detected.")
|
||||
m.assert_called_once()
|
||||
|
||||
def test_sanity_check_success(self, mock_check_sanity: mock.MagicMock) -> None:
|
||||
mock_check_sanity.return_value = SanityCheckMessages()
|
||||
assert tasks.sanity_check() == "No issues detected."
|
||||
mock_check_sanity.assert_called_once()
|
||||
|
||||
def test_sanity_check_error_raises(
|
||||
self,
|
||||
mock_check_sanity: mock.MagicMock,
|
||||
sample_doc: Document,
|
||||
) -> None:
|
||||
@mock.patch("documents.tasks.sanity_checker.check_sanity")
|
||||
def test_sanity_check_error(self, m) -> None:
|
||||
messages = SanityCheckMessages()
|
||||
messages.error(sample_doc.pk, "some error")
|
||||
mock_check_sanity.return_value = messages
|
||||
with pytest.raises(SanityCheckFailedException):
|
||||
tasks.sanity_check()
|
||||
mock_check_sanity.assert_called_once()
|
||||
messages.error(None, "Some error")
|
||||
m.return_value = messages
|
||||
self.assertRaises(SanityCheckFailedException, tasks.sanity_check)
|
||||
m.assert_called_once()
|
||||
|
||||
def test_sanity_check_error_no_raise(
|
||||
self,
|
||||
mock_check_sanity: mock.MagicMock,
|
||||
sample_doc: Document,
|
||||
) -> None:
|
||||
@mock.patch("documents.tasks.sanity_checker.check_sanity")
|
||||
def test_sanity_check_error_no_raise(self, m) -> None:
|
||||
messages = SanityCheckMessages()
|
||||
messages.error(sample_doc.pk, "some error")
|
||||
mock_check_sanity.return_value = messages
|
||||
messages.error(None, "Some error")
|
||||
m.return_value = messages
|
||||
# No exception should be raised
|
||||
result = tasks.sanity_check(raise_on_error=False)
|
||||
assert "1 document(s) with errors" in result
|
||||
assert "Check logs for details." in result
|
||||
mock_check_sanity.assert_called_once()
|
||||
self.assertEqual(
|
||||
result,
|
||||
"Sanity check exited with errors. See log.",
|
||||
)
|
||||
m.assert_called_once()
|
||||
|
||||
def test_sanity_check_warning_only(
|
||||
self,
|
||||
mock_check_sanity: mock.MagicMock,
|
||||
) -> None:
|
||||
@mock.patch("documents.tasks.sanity_checker.check_sanity")
|
||||
def test_sanity_check_warning(self, m) -> None:
|
||||
messages = SanityCheckMessages()
|
||||
messages.warning(None, "extra file")
|
||||
mock_check_sanity.return_value = messages
|
||||
result = tasks.sanity_check()
|
||||
assert result == "1 global warning(s) found."
|
||||
mock_check_sanity.assert_called_once()
|
||||
messages.warning(None, "Some warning")
|
||||
m.return_value = messages
|
||||
self.assertEqual(
|
||||
tasks.sanity_check(),
|
||||
"Sanity check exited with warnings. See log.",
|
||||
)
|
||||
m.assert_called_once()
|
||||
|
||||
def test_sanity_check_info_only(
|
||||
self,
|
||||
mock_check_sanity: mock.MagicMock,
|
||||
sample_doc: Document,
|
||||
) -> None:
|
||||
@mock.patch("documents.tasks.sanity_checker.check_sanity")
|
||||
def test_sanity_check_info(self, m) -> None:
|
||||
messages = SanityCheckMessages()
|
||||
messages.info(sample_doc.pk, "some info")
|
||||
mock_check_sanity.return_value = messages
|
||||
result = tasks.sanity_check()
|
||||
assert result == "1 document(s) with infos found."
|
||||
mock_check_sanity.assert_called_once()
|
||||
|
||||
def test_sanity_check_errors_warnings_and_infos(
|
||||
self,
|
||||
mock_check_sanity: mock.MagicMock,
|
||||
sample_doc: Document,
|
||||
) -> None:
|
||||
messages = SanityCheckMessages()
|
||||
messages.error(sample_doc.pk, "broken")
|
||||
messages.warning(sample_doc.pk, "odd")
|
||||
messages.info(sample_doc.pk, "fyi")
|
||||
messages.warning(None, "extra file")
|
||||
mock_check_sanity.return_value = messages
|
||||
result = tasks.sanity_check(raise_on_error=False)
|
||||
assert "1 document(s) with errors" in result
|
||||
assert "1 document(s) with warnings" in result
|
||||
assert "1 document(s) with infos" in result
|
||||
assert "1 global warning(s)" in result
|
||||
assert "Check logs for details." in result
|
||||
mock_check_sanity.assert_called_once()
|
||||
messages.info(None, "Some info")
|
||||
m.return_value = messages
|
||||
self.assertEqual(
|
||||
tasks.sanity_check(),
|
||||
"Sanity check exited with infos. See log.",
|
||||
)
|
||||
m.assert_called_once()
|
||||
|
||||
|
||||
class TestBulkUpdate(DirectoriesMixin, TestCase):
|
||||
|
||||
@@ -378,6 +378,7 @@ class ApplicationConfigurationViewSet(ModelViewSet):
|
||||
):
|
||||
# AI index was just enabled and vector store file does not exist
|
||||
llmindex_index.delay(
|
||||
progress_bar_disable=True,
|
||||
rebuild=True,
|
||||
scheduled=False,
|
||||
auto=True,
|
||||
|
||||
@@ -1,13 +1,11 @@
|
||||
import logging
|
||||
import shutil
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterable
|
||||
from datetime import timedelta
|
||||
from pathlib import Path
|
||||
from typing import TypeVar
|
||||
|
||||
import faiss
|
||||
import llama_index.core.settings as llama_settings
|
||||
import tqdm
|
||||
from celery import states
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
@@ -31,14 +29,6 @@ from paperless_ai.embedding import build_llm_index_text
|
||||
from paperless_ai.embedding import get_embedding_dim
|
||||
from paperless_ai.embedding import get_embedding_model
|
||||
|
||||
_T = TypeVar("_T")
|
||||
IterWrapper = Callable[[Iterable[_T]], Iterable[_T]]
|
||||
|
||||
|
||||
def _identity(iterable: Iterable[_T]) -> Iterable[_T]:
|
||||
return iterable
|
||||
|
||||
|
||||
logger = logging.getLogger("paperless_ai.indexing")
|
||||
|
||||
|
||||
@@ -166,11 +156,7 @@ def vector_store_file_exists():
|
||||
return Path(settings.LLM_INDEX_DIR / "default__vector_store.json").exists()
|
||||
|
||||
|
||||
def update_llm_index(
|
||||
*,
|
||||
iter_wrapper: IterWrapper[Document] = _identity,
|
||||
rebuild=False,
|
||||
) -> str:
|
||||
def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
|
||||
"""
|
||||
Rebuild or update the LLM index.
|
||||
"""
|
||||
@@ -190,7 +176,7 @@ def update_llm_index(
|
||||
embed_model = get_embedding_model()
|
||||
llama_settings.Settings.embed_model = embed_model
|
||||
storage_context = get_or_create_storage_context(rebuild=True)
|
||||
for document in iter_wrapper(documents):
|
||||
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
|
||||
document_nodes = build_document_node(document)
|
||||
nodes.extend(document_nodes)
|
||||
|
||||
@@ -198,7 +184,7 @@ def update_llm_index(
|
||||
nodes=nodes,
|
||||
storage_context=storage_context,
|
||||
embed_model=embed_model,
|
||||
show_progress=False,
|
||||
show_progress=not progress_bar_disable,
|
||||
)
|
||||
msg = "LLM index rebuilt successfully."
|
||||
else:
|
||||
@@ -210,7 +196,7 @@ def update_llm_index(
|
||||
for node in index.docstore.get_nodes(all_node_ids)
|
||||
}
|
||||
|
||||
for document in iter_wrapper(documents):
|
||||
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
|
||||
doc_id = str(document.id)
|
||||
document_modified = document.modified.isoformat()
|
||||
|
||||
|
||||
16
uv.lock
generated
16
uv.lock
generated
@@ -1342,11 +1342,11 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "faker"
|
||||
version = "40.1.2"
|
||||
version = "40.5.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/5e/77/1c3ff07b6739b9a1d23ca01ec0a90a309a33b78e345a3eb52f9ce9240e36/faker-40.1.2.tar.gz", hash = "sha256:b76a68163aa5f171d260fc24827a8349bc1db672f6a665359e8d0095e8135d30", size = 1949802, upload-time = "2026-01-13T20:51:49.917Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/03/2a/96fff3edcb10f6505143448a4b91535f77b74865cec45be52690ee280443/faker-40.5.1.tar.gz", hash = "sha256:70222361cd82aa10cb86066d1a4e8f47f2bcdc919615c412045a69c4e6da0cd3", size = 1952684, upload-time = "2026-02-23T21:34:38.362Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/46/ec/91a434c8a53d40c3598966621dea9c50512bec6ce8e76fa1751015e74cef/faker-40.1.2-py3-none-any.whl", hash = "sha256:93503165c165d330260e4379fd6dc07c94da90c611ed3191a0174d2ab9966a42", size = 1985633, upload-time = "2026-01-13T20:51:47.982Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4d/a9/1eed4db92d0aec2f9bfdf1faae0ab0418b5e121dda5701f118a7a4f0cd6a/faker-40.5.1-py3-none-any.whl", hash = "sha256:c69640c1e13bad49b4bcebcbf1b52f9f1a872b6ea186c248ada34d798f1661bf", size = 1987053, upload-time = "2026-02-23T21:34:36.418Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3121,6 +3121,7 @@ webserver = [
|
||||
dev = [
|
||||
{ name = "daphne", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "factory-boy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "faker", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "imagehash", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "prek", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "pytest", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
@@ -3145,6 +3146,7 @@ lint = [
|
||||
testing = [
|
||||
{ name = "daphne", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "factory-boy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "faker", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "imagehash", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "pytest", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "pytest-cov", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
@@ -3257,6 +3259,7 @@ provides-extras = ["mariadb", "postgres", "webserver"]
|
||||
dev = [
|
||||
{ name = "daphne" },
|
||||
{ name = "factory-boy", specifier = "~=3.3.1" },
|
||||
{ name = "faker", specifier = "~=40.5.1" },
|
||||
{ name = "imagehash" },
|
||||
{ name = "prek", specifier = "~=0.3.0" },
|
||||
{ name = "pytest", specifier = "~=9.0.0" },
|
||||
@@ -3279,6 +3282,7 @@ lint = [
|
||||
testing = [
|
||||
{ name = "daphne" },
|
||||
{ name = "factory-boy", specifier = "~=3.3.1" },
|
||||
{ name = "faker", specifier = "~=40.5.1" },
|
||||
{ name = "imagehash" },
|
||||
{ name = "pytest", specifier = "~=9.0.0" },
|
||||
{ name = "pytest-cov", specifier = "~=7.0.0" },
|
||||
@@ -5906,11 +5910,11 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "whitenoise"
|
||||
version = "6.12.0"
|
||||
version = "6.11.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/cb/2a/55b3f3a4ec326cd077c1c3defeee656b9298372a69229134d930151acd01/whitenoise-6.12.0.tar.gz", hash = "sha256:f723ebb76a112e98816ff80fcea0a6c9b8ecde835f8ddda25df7a30a3c2db6ad", size = 26841, upload-time = "2026-02-27T00:05:42.028Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/15/95/8c81ec6b6ebcbf8aca2de7603070ccf37dbb873b03f20708e0f7c1664bc6/whitenoise-6.11.0.tar.gz", hash = "sha256:0f5bfce6061ae6611cd9396a8231e088722e4fc67bc13a111be74c738d99375f", size = 26432, upload-time = "2025-09-18T09:16:10.995Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/db/eb/d5583a11486211f3ebd4b385545ae787f32363d453c19fffd81106c9c138/whitenoise-6.12.0-py3-none-any.whl", hash = "sha256:fc5e8c572e33ebf24795b47b6a7da8da3c00cff2349f5b04c02f28d0cc5a3cc2", size = 20302, upload-time = "2026-02-27T00:05:40.086Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6c/e9/4366332f9295fe0647d7d3251ce18f5615fbcb12d02c79a26f8dba9221b3/whitenoise-6.11.0-py3-none-any.whl", hash = "sha256:b2aeb45950597236f53b5342b3121c5de69c8da0109362aee506ce88e022d258", size = 20197, upload-time = "2025-09-18T09:16:09.754Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
Reference in New Issue
Block a user