mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-08 17:18:54 +00:00
Compare commits
6 Commits
feature-ve
...
dependabot
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dffbc4f98b | ||
|
|
b7a5255102 | ||
|
|
962a4ddd73 | ||
|
|
a5fe88d2a1 | ||
|
|
51c59746a7 | ||
|
|
c232d443fa |
2
.github/dependabot.yml
vendored
2
.github/dependabot.yml
vendored
@@ -164,6 +164,8 @@ updates:
|
||||
directory: "/" # Location of package manifests
|
||||
schedule:
|
||||
interval: "monthly"
|
||||
cooldown:
|
||||
default-days: 7
|
||||
groups:
|
||||
pre-commit-dependencies:
|
||||
patterns:
|
||||
|
||||
60
.github/workflows/ci-backend.yml
vendored
60
.github/workflows/ci-backend.yml
vendored
@@ -13,10 +13,13 @@ concurrency:
|
||||
env:
|
||||
DEFAULT_UV_VERSION: "0.10.x"
|
||||
NLTK_DATA: "/usr/share/nltk_data"
|
||||
permissions: {}
|
||||
jobs:
|
||||
changes:
|
||||
name: Detect Backend Changes
|
||||
runs-on: ubuntu-slim
|
||||
permissions:
|
||||
contents: read
|
||||
outputs:
|
||||
backend_changed: ${{ steps.force.outputs.run_all == 'true' || steps.filter.outputs.backend == 'true' }}
|
||||
steps:
|
||||
@@ -27,10 +30,13 @@ jobs:
|
||||
persist-credentials: false
|
||||
- name: Decide run mode
|
||||
id: force
|
||||
env:
|
||||
EVENT_NAME: ${{ github.event_name }}
|
||||
REF_NAME: ${{ github.ref_name }}
|
||||
run: |
|
||||
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
|
||||
if [[ "${EVENT_NAME}" == "workflow_dispatch" ]]; then
|
||||
echo "run_all=true" >> "$GITHUB_OUTPUT"
|
||||
elif [[ "${{ github.event_name }}" == "push" && ( "${{ github.ref_name }}" == "main" || "${{ github.ref_name }}" == "dev" ) ]]; then
|
||||
elif [[ "${EVENT_NAME}" == "push" && ( "${REF_NAME}" == "main" || "${REF_NAME}" == "dev" ) ]]; then
|
||||
echo "run_all=true" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "run_all=false" >> "$GITHUB_OUTPUT"
|
||||
@@ -38,15 +44,22 @@ jobs:
|
||||
- name: Set diff range
|
||||
id: range
|
||||
if: steps.force.outputs.run_all != 'true'
|
||||
env:
|
||||
BEFORE_SHA: ${{ github.event.before }}
|
||||
DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
|
||||
EVENT_CREATED: ${{ github.event.created }}
|
||||
EVENT_NAME: ${{ github.event_name }}
|
||||
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
|
||||
SHA: ${{ github.sha }}
|
||||
run: |
|
||||
if [[ "${{ github.event_name }}" == "pull_request" ]]; then
|
||||
echo "base=${{ github.event.pull_request.base.sha }}" >> "$GITHUB_OUTPUT"
|
||||
elif [[ "${{ github.event.created }}" == "true" ]]; then
|
||||
echo "base=${{ github.event.repository.default_branch }}" >> "$GITHUB_OUTPUT"
|
||||
if [[ "${EVENT_NAME}" == "pull_request" ]]; then
|
||||
echo "base=${PR_BASE_SHA}" >> "$GITHUB_OUTPUT"
|
||||
elif [[ "${EVENT_CREATED}" == "true" ]]; then
|
||||
echo "base=${DEFAULT_BRANCH}" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "base=${{ github.event.before }}" >> "$GITHUB_OUTPUT"
|
||||
echo "base=${BEFORE_SHA}" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
echo "ref=${{ github.sha }}" >> "$GITHUB_OUTPUT"
|
||||
echo "ref=${SHA}" >> "$GITHUB_OUTPUT"
|
||||
- name: Detect changes
|
||||
id: filter
|
||||
if: steps.force.outputs.run_all != 'true'
|
||||
@@ -66,6 +79,8 @@ jobs:
|
||||
if: needs.changes.outputs.backend_changed == 'true'
|
||||
name: "Python ${{ matrix.python-version }}"
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
contents: read
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ['3.11', '3.12', '3.13', '3.14']
|
||||
@@ -99,9 +114,11 @@ jobs:
|
||||
run: |
|
||||
sudo cp docker/rootfs/etc/ImageMagick-6/paperless-policy.xml /etc/ImageMagick-6/policy.xml
|
||||
- name: Install Python dependencies
|
||||
env:
|
||||
PYTHON_VERSION: ${{ steps.setup-python.outputs.python-version }}
|
||||
run: |
|
||||
uv sync \
|
||||
--python ${{ steps.setup-python.outputs.python-version }} \
|
||||
--python "${PYTHON_VERSION}" \
|
||||
--group testing \
|
||||
--frozen
|
||||
- name: List installed Python dependencies
|
||||
@@ -109,14 +126,15 @@ jobs:
|
||||
uv pip list
|
||||
- name: Install NLTK data
|
||||
run: |
|
||||
uv run python -m nltk.downloader punkt punkt_tab snowball_data stopwords -d ${{ env.NLTK_DATA }}
|
||||
uv run python -m nltk.downloader punkt punkt_tab snowball_data stopwords -d "${NLTK_DATA}"
|
||||
- name: Run tests
|
||||
env:
|
||||
NLTK_DATA: ${{ env.NLTK_DATA }}
|
||||
PAPERLESS_CI_TEST: 1
|
||||
PYTHON_VERSION: ${{ steps.setup-python.outputs.python-version }}
|
||||
run: |
|
||||
uv run \
|
||||
--python ${{ steps.setup-python.outputs.python-version }} \
|
||||
--python "${PYTHON_VERSION}" \
|
||||
--dev \
|
||||
--frozen \
|
||||
pytest
|
||||
@@ -143,6 +161,8 @@ jobs:
|
||||
if: needs.changes.outputs.backend_changed == 'true'
|
||||
name: Check project typing
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
contents: read
|
||||
env:
|
||||
DEFAULT_PYTHON: "3.12"
|
||||
steps:
|
||||
@@ -162,9 +182,11 @@ jobs:
|
||||
enable-cache: true
|
||||
python-version: ${{ steps.setup-python.outputs.python-version }}
|
||||
- name: Install Python dependencies
|
||||
env:
|
||||
PYTHON_VERSION: ${{ steps.setup-python.outputs.python-version }}
|
||||
run: |
|
||||
uv sync \
|
||||
--python ${{ steps.setup-python.outputs.python-version }} \
|
||||
--python "${PYTHON_VERSION}" \
|
||||
--group testing \
|
||||
--group typing \
|
||||
--frozen
|
||||
@@ -200,19 +222,23 @@ jobs:
|
||||
runs-on: ubuntu-slim
|
||||
steps:
|
||||
- name: Check gate
|
||||
env:
|
||||
BACKEND_CHANGED: ${{ needs.changes.outputs.backend_changed }}
|
||||
TEST_RESULT: ${{ needs.test.result }}
|
||||
TYPING_RESULT: ${{ needs.typing.result }}
|
||||
run: |
|
||||
if [[ "${{ needs.changes.outputs.backend_changed }}" != "true" ]]; then
|
||||
if [[ "${BACKEND_CHANGED}" != "true" ]]; then
|
||||
echo "No backend-relevant changes detected."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [[ "${{ needs.test.result }}" != "success" ]]; then
|
||||
echo "::error::Backend test job result: ${{ needs.test.result }}"
|
||||
if [[ "${TEST_RESULT}" != "success" ]]; then
|
||||
echo "::error::Backend test job result: ${TEST_RESULT}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ "${{ needs.typing.result }}" != "success" ]]; then
|
||||
echo "::error::Backend typing job result: ${{ needs.typing.result }}"
|
||||
if [[ "${TYPING_RESULT}" != "success" ]]; then
|
||||
echo "::error::Backend typing job result: ${TYPING_RESULT}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
7
.github/workflows/ci-docker.yml
vendored
7
.github/workflows/ci-docker.yml
vendored
@@ -89,7 +89,7 @@ jobs:
|
||||
push_external="true"
|
||||
;;
|
||||
esac
|
||||
case "${{ github.ref }}" in
|
||||
case "${GITHUB_REF}" in
|
||||
refs/tags/v*|*beta.rc*)
|
||||
push_external="true"
|
||||
;;
|
||||
@@ -166,6 +166,7 @@ jobs:
|
||||
runs-on: ubuntu-24.04
|
||||
needs: build-arch
|
||||
if: needs.build-arch.outputs.should-push == 'true'
|
||||
environment: image-publishing
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
@@ -230,8 +231,10 @@ jobs:
|
||||
|
||||
docker buildx imagetools create ${tags} ${digests}
|
||||
- name: Inspect image
|
||||
env:
|
||||
FIRST_TAG: ${{ fromJSON(steps.docker-meta.outputs.json).tags[0] }}
|
||||
run: |
|
||||
docker buildx imagetools inspect ${{ fromJSON(steps.docker-meta.outputs.json).tags[0] }}
|
||||
docker buildx imagetools inspect "${FIRST_TAG}"
|
||||
- name: Copy to Docker Hub
|
||||
if: needs.build-arch.outputs.push-external == 'true'
|
||||
env:
|
||||
|
||||
5
.github/workflows/ci-docs.yml
vendored
5
.github/workflows/ci-docs.yml
vendored
@@ -10,8 +10,6 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
permissions:
|
||||
contents: read
|
||||
pages: write
|
||||
id-token: write
|
||||
env:
|
||||
DEFAULT_UV_VERSION: "0.10.x"
|
||||
DEFAULT_PYTHON_VERSION: "3.12"
|
||||
@@ -105,6 +103,9 @@ jobs:
|
||||
needs: [changes, build]
|
||||
if: github.event_name == 'push' && github.ref == 'refs/heads/main' && needs.changes.outputs.docs_changed == 'true'
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
pages: write
|
||||
id-token: write
|
||||
environment:
|
||||
name: github-pages
|
||||
url: ${{ steps.deployment.outputs.page_url }}
|
||||
|
||||
41
.github/workflows/ci-frontend.yml
vendored
41
.github/workflows/ci-frontend.yml
vendored
@@ -10,10 +10,13 @@ on:
|
||||
concurrency:
|
||||
group: frontend-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
permissions: {}
|
||||
jobs:
|
||||
changes:
|
||||
name: Detect Frontend Changes
|
||||
runs-on: ubuntu-slim
|
||||
permissions:
|
||||
contents: read
|
||||
outputs:
|
||||
frontend_changed: ${{ steps.force.outputs.run_all == 'true' || steps.filter.outputs.frontend == 'true' }}
|
||||
steps:
|
||||
@@ -21,12 +24,16 @@ jobs:
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: false
|
||||
- name: Decide run mode
|
||||
id: force
|
||||
env:
|
||||
EVENT_NAME: ${{ github.event_name }}
|
||||
REF_NAME: ${{ github.ref_name }}
|
||||
run: |
|
||||
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
|
||||
if [[ "${EVENT_NAME}" == "workflow_dispatch" ]]; then
|
||||
echo "run_all=true" >> "$GITHUB_OUTPUT"
|
||||
elif [[ "${{ github.event_name }}" == "push" && ( "${{ github.ref_name }}" == "main" || "${{ github.ref_name }}" == "dev" ) ]]; then
|
||||
elif [[ "${EVENT_NAME}" == "push" && ( "${REF_NAME}" == "main" || "${REF_NAME}" == "dev" ) ]]; then
|
||||
echo "run_all=true" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "run_all=false" >> "$GITHUB_OUTPUT"
|
||||
@@ -34,15 +41,22 @@ jobs:
|
||||
- name: Set diff range
|
||||
id: range
|
||||
if: steps.force.outputs.run_all != 'true'
|
||||
env:
|
||||
BEFORE_SHA: ${{ github.event.before }}
|
||||
DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
|
||||
EVENT_CREATED: ${{ github.event.created }}
|
||||
EVENT_NAME: ${{ github.event_name }}
|
||||
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
|
||||
SHA: ${{ github.sha }}
|
||||
run: |
|
||||
if [[ "${{ github.event_name }}" == "pull_request" ]]; then
|
||||
echo "base=${{ github.event.pull_request.base.sha }}" >> "$GITHUB_OUTPUT"
|
||||
elif [[ "${{ github.event.created }}" == "true" ]]; then
|
||||
echo "base=${{ github.event.repository.default_branch }}" >> "$GITHUB_OUTPUT"
|
||||
if [[ "${EVENT_NAME}" == "pull_request" ]]; then
|
||||
echo "base=${PR_BASE_SHA}" >> "$GITHUB_OUTPUT"
|
||||
elif [[ "${EVENT_CREATED}" == "true" ]]; then
|
||||
echo "base=${DEFAULT_BRANCH}" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "base=${{ github.event.before }}" >> "$GITHUB_OUTPUT"
|
||||
echo "base=${BEFORE_SHA}" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
echo "ref=${{ github.sha }}" >> "$GITHUB_OUTPUT"
|
||||
echo "ref=${SHA}" >> "$GITHUB_OUTPUT"
|
||||
- name: Detect changes
|
||||
id: filter
|
||||
if: steps.force.outputs.run_all != 'true'
|
||||
@@ -59,6 +73,8 @@ jobs:
|
||||
if: needs.changes.outputs.frontend_changed == 'true'
|
||||
name: Install Dependencies
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
contents: read
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
@@ -89,6 +105,8 @@ jobs:
|
||||
needs: [changes, install-dependencies]
|
||||
if: needs.changes.outputs.frontend_changed == 'true'
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
contents: read
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
@@ -120,6 +138,8 @@ jobs:
|
||||
needs: [changes, install-dependencies]
|
||||
if: needs.changes.outputs.frontend_changed == 'true'
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
contents: read
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -169,6 +189,8 @@ jobs:
|
||||
needs: [changes, install-dependencies]
|
||||
if: needs.changes.outputs.frontend_changed == 'true'
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
contents: read
|
||||
container: mcr.microsoft.com/playwright:v1.58.2-noble
|
||||
env:
|
||||
PLAYWRIGHT_BROWSERS_PATH: /ms-playwright
|
||||
@@ -212,6 +234,9 @@ jobs:
|
||||
needs: [changes, unit-tests, e2e-tests]
|
||||
if: needs.changes.outputs.frontend_changed == 'true'
|
||||
runs-on: ubuntu-24.04
|
||||
environment: bundle-analysis
|
||||
permissions:
|
||||
contents: read
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
2
.github/workflows/ci-lint.yml
vendored
2
.github/workflows/ci-lint.yml
vendored
@@ -9,6 +9,8 @@ on:
|
||||
concurrency:
|
||||
group: lint-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
permissions:
|
||||
contents: read
|
||||
jobs:
|
||||
lint:
|
||||
name: Linting via prek
|
||||
|
||||
67
.github/workflows/ci-release.yml
vendored
67
.github/workflows/ci-release.yml
vendored
@@ -10,10 +10,14 @@ concurrency:
|
||||
env:
|
||||
DEFAULT_UV_VERSION: "0.10.x"
|
||||
DEFAULT_PYTHON_VERSION: "3.12"
|
||||
permissions: {}
|
||||
jobs:
|
||||
wait-for-docker:
|
||||
name: Wait for Docker Build
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
checks: read
|
||||
statuses: read
|
||||
steps:
|
||||
- name: Wait for Docker build
|
||||
uses: lewagon/wait-on-check-action@74049309dfeff245fe8009a0137eacf28136cb3c # v1.5.0
|
||||
@@ -26,6 +30,8 @@ jobs:
|
||||
name: Build Release
|
||||
needs: wait-for-docker
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
contents: read
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
@@ -40,8 +46,7 @@ jobs:
|
||||
uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
|
||||
with:
|
||||
node-version: 24.x
|
||||
cache: 'pnpm'
|
||||
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
|
||||
package-manager-cache: false
|
||||
- name: Install frontend dependencies
|
||||
run: cd src-ui && pnpm install
|
||||
- name: Build frontend
|
||||
@@ -56,20 +61,24 @@ jobs:
|
||||
uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # v7.3.1
|
||||
with:
|
||||
version: ${{ env.DEFAULT_UV_VERSION }}
|
||||
enable-cache: true
|
||||
enable-cache: false
|
||||
python-version: ${{ steps.setup-python.outputs.python-version }}
|
||||
- name: Install Python dependencies
|
||||
env:
|
||||
PYTHON_VERSION: ${{ steps.setup-python.outputs.python-version }}
|
||||
run: |
|
||||
uv sync --python ${{ steps.setup-python.outputs.python-version }} --dev --frozen
|
||||
uv sync --python "${PYTHON_VERSION}" --dev --frozen
|
||||
- name: Install system dependencies
|
||||
run: |
|
||||
sudo apt-get update -qq
|
||||
sudo apt-get install -qq --no-install-recommends gettext liblept5
|
||||
# ---- Build Documentation ----
|
||||
- name: Build documentation
|
||||
env:
|
||||
PYTHON_VERSION: ${{ steps.setup-python.outputs.python-version }}
|
||||
run: |
|
||||
uv run \
|
||||
--python ${{ steps.setup-python.outputs.python-version }} \
|
||||
--python "${PYTHON_VERSION}" \
|
||||
--dev \
|
||||
--frozen \
|
||||
zensical build --clean
|
||||
@@ -78,16 +87,20 @@ jobs:
|
||||
run: |
|
||||
uv export --quiet --no-dev --all-extras --format requirements-txt --output-file requirements.txt
|
||||
- name: Compile messages
|
||||
env:
|
||||
PYTHON_VERSION: ${{ steps.setup-python.outputs.python-version }}
|
||||
run: |
|
||||
cd src/
|
||||
uv run \
|
||||
--python ${{ steps.setup-python.outputs.python-version }} \
|
||||
--python "${PYTHON_VERSION}" \
|
||||
manage.py compilemessages
|
||||
- name: Collect static files
|
||||
env:
|
||||
PYTHON_VERSION: ${{ steps.setup-python.outputs.python-version }}
|
||||
run: |
|
||||
cd src/
|
||||
uv run \
|
||||
--python ${{ steps.setup-python.outputs.python-version }} \
|
||||
--python "${PYTHON_VERSION}" \
|
||||
manage.py collectstatic --no-input --clear
|
||||
- name: Assemble release package
|
||||
run: |
|
||||
@@ -129,6 +142,9 @@ jobs:
|
||||
name: Publish Release
|
||||
needs: build-release
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
contents: write
|
||||
pull-requests: write
|
||||
outputs:
|
||||
prerelease: ${{ steps.get-version.outputs.prerelease }}
|
||||
changelog: ${{ steps.create-release.outputs.body }}
|
||||
@@ -141,9 +157,11 @@ jobs:
|
||||
path: ./
|
||||
- name: Get version info
|
||||
id: get-version
|
||||
env:
|
||||
REF_NAME: ${{ github.ref_name }}
|
||||
run: |
|
||||
echo "version=${{ github.ref_name }}" >> $GITHUB_OUTPUT
|
||||
if [[ "${{ github.ref_name }}" == *"-beta.rc"* ]]; then
|
||||
echo "version=${REF_NAME}" >> $GITHUB_OUTPUT
|
||||
if [[ "${REF_NAME}" == *"-beta.rc"* ]]; then
|
||||
echo "prerelease=true" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "prerelease=false" >> $GITHUB_OUTPUT
|
||||
@@ -176,6 +194,9 @@ jobs:
|
||||
needs: publish-release
|
||||
if: needs.publish-release.outputs.prerelease == 'false'
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
contents: write
|
||||
pull-requests: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
@@ -191,15 +212,21 @@ jobs:
|
||||
uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # v7.3.1
|
||||
with:
|
||||
version: ${{ env.DEFAULT_UV_VERSION }}
|
||||
enable-cache: true
|
||||
enable-cache: false
|
||||
python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
|
||||
- name: Update changelog
|
||||
working-directory: docs
|
||||
env:
|
||||
CHANGELOG: ${{ needs.publish-release.outputs.changelog }}
|
||||
PYTHON_VERSION: ${{ steps.setup-python.outputs.python-version }}
|
||||
VERSION: ${{ needs.publish-release.outputs.version }}
|
||||
run: |
|
||||
git branch ${{ needs.publish-release.outputs.version }}-changelog
|
||||
git checkout ${{ needs.publish-release.outputs.version }}-changelog
|
||||
branch_name="${VERSION}-changelog"
|
||||
|
||||
echo -e "# Changelog\n\n${{ needs.publish-release.outputs.changelog }}\n" > changelog-new.md
|
||||
git branch "${branch_name}"
|
||||
git checkout "${branch_name}"
|
||||
|
||||
printf '# Changelog\n\n%s\n' "${CHANGELOG}" > changelog-new.md
|
||||
|
||||
echo "Manually linking usernames"
|
||||
sed -i -r 's|@([a-zA-Z0-9_]+) \(\[#|[@\1](https://github.com/\1) ([#|g' changelog-new.md
|
||||
@@ -212,24 +239,28 @@ jobs:
|
||||
mv changelog-new.md changelog.md
|
||||
|
||||
uv run \
|
||||
--python ${{ steps.setup-python.outputs.python-version }} \
|
||||
--python "${PYTHON_VERSION}" \
|
||||
--dev \
|
||||
prek run --files changelog.md || true
|
||||
|
||||
git config --global user.name "github-actions"
|
||||
git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com"
|
||||
git commit -am "Changelog ${{ needs.publish-release.outputs.version }} - GHA"
|
||||
git push origin ${{ needs.publish-release.outputs.version }}-changelog
|
||||
git commit -am "Changelog ${VERSION} - GHA"
|
||||
git push origin "${branch_name}"
|
||||
- name: Create pull request
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
env:
|
||||
VERSION: ${{ needs.publish-release.outputs.version }}
|
||||
with:
|
||||
script: |
|
||||
const { repo, owner } = context.repo;
|
||||
const version = process.env.VERSION;
|
||||
const head = `${version}-changelog`;
|
||||
const result = await github.rest.pulls.create({
|
||||
title: 'Documentation: Add ${{ needs.publish-release.outputs.version }} changelog',
|
||||
title: `Documentation: Add ${version} changelog`,
|
||||
owner,
|
||||
repo,
|
||||
head: '${{ needs.publish-release.outputs.version }}-changelog',
|
||||
head,
|
||||
base: 'main',
|
||||
body: 'This PR is auto-generated by CI.'
|
||||
});
|
||||
|
||||
10
.github/workflows/ci-static-analysis.yml
vendored
10
.github/workflows/ci-static-analysis.yml
vendored
@@ -33,10 +33,18 @@ jobs:
|
||||
container:
|
||||
image: semgrep/semgrep:1.155.0@sha256:cc869c685dcc0fe497c86258da9f205397d8108e56d21a86082ea4886e52784d
|
||||
if: github.actor != 'dependabot[bot]'
|
||||
permissions:
|
||||
contents: read
|
||||
security-events: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Run Semgrep
|
||||
run: semgrep scan --config auto
|
||||
run: semgrep scan --config auto --sarif-output results.sarif
|
||||
- name: Upload results to GitHub code scanning
|
||||
uses: github/codeql-action/upload-sarif@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1
|
||||
if: always()
|
||||
with:
|
||||
sarif_file: results.sarif
|
||||
|
||||
3
.github/workflows/cleanup-tags.yml
vendored
3
.github/workflows/cleanup-tags.yml
vendored
@@ -12,11 +12,13 @@ on:
|
||||
concurrency:
|
||||
group: registry-tags-cleanup
|
||||
cancel-in-progress: false
|
||||
permissions: {}
|
||||
jobs:
|
||||
cleanup-images:
|
||||
name: Cleanup Image Tags for ${{ matrix.primary-name }}
|
||||
if: github.repository_owner == 'paperless-ngx'
|
||||
runs-on: ubuntu-24.04
|
||||
environment: registry-maintenance
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -43,6 +45,7 @@ jobs:
|
||||
runs-on: ubuntu-24.04
|
||||
needs:
|
||||
- cleanup-images
|
||||
environment: registry-maintenance
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
|
||||
4
.github/workflows/crowdin.yml
vendored
4
.github/workflows/crowdin.yml
vendored
@@ -6,11 +6,15 @@ on:
|
||||
push:
|
||||
paths: ['src/locale/**', 'src-ui/messages.xlf', 'src-ui/src/locale/**']
|
||||
branches: [dev]
|
||||
permissions:
|
||||
contents: write
|
||||
pull-requests: write
|
||||
jobs:
|
||||
synchronize-with-crowdin:
|
||||
name: Crowdin Sync
|
||||
if: github.repository_owner == 'paperless-ngx'
|
||||
runs-on: ubuntu-24.04
|
||||
environment: translation-sync
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
17
.github/workflows/repo-maintenance.yml
vendored
17
.github/workflows/repo-maintenance.yml
vendored
@@ -3,10 +3,6 @@ on:
|
||||
schedule:
|
||||
- cron: '0 3 * * *'
|
||||
workflow_dispatch:
|
||||
permissions:
|
||||
issues: write
|
||||
pull-requests: write
|
||||
discussions: write
|
||||
concurrency:
|
||||
group: lock
|
||||
jobs:
|
||||
@@ -14,6 +10,9 @@ jobs:
|
||||
name: 'Stale'
|
||||
if: github.repository_owner == 'paperless-ngx'
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
issues: write
|
||||
pull-requests: write
|
||||
steps:
|
||||
- uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f # v10.2.0
|
||||
with:
|
||||
@@ -36,6 +35,10 @@ jobs:
|
||||
name: 'Lock Old Threads'
|
||||
if: github.repository_owner == 'paperless-ngx'
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
issues: write
|
||||
pull-requests: write
|
||||
discussions: write
|
||||
steps:
|
||||
- uses: dessant/lock-threads@7266a7ce5c1df01b1c6db85bf8cd86c737dadbe7 # v6.0.0
|
||||
with:
|
||||
@@ -56,6 +59,8 @@ jobs:
|
||||
name: 'Close Answered Discussions'
|
||||
if: github.repository_owner == 'paperless-ngx'
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
discussions: write
|
||||
steps:
|
||||
- uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
@@ -113,6 +118,8 @@ jobs:
|
||||
name: 'Close Outdated Discussions'
|
||||
if: github.repository_owner == 'paperless-ngx'
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
discussions: write
|
||||
steps:
|
||||
- uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
@@ -205,6 +212,8 @@ jobs:
|
||||
name: 'Close Unsupported Feature Requests'
|
||||
if: github.repository_owner == 'paperless-ngx'
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
discussions: write
|
||||
steps:
|
||||
- uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
|
||||
1
.github/workflows/translate-strings.yml
vendored
1
.github/workflows/translate-strings.yml
vendored
@@ -7,6 +7,7 @@ jobs:
|
||||
generate-translate-strings:
|
||||
name: Generate Translation Strings
|
||||
runs-on: ubuntu-latest
|
||||
environment: translation-sync
|
||||
permissions:
|
||||
contents: write
|
||||
steps:
|
||||
|
||||
29
.github/zizmor.yml
vendored
Normal file
29
.github/zizmor.yml
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
rules:
|
||||
template-injection:
|
||||
ignore:
|
||||
# github.event_name is a GitHub-internal constant (push/pull_request/etc.),
|
||||
# not attacker-controllable.
|
||||
- ci-docker.yml:74
|
||||
- ci-docs.yml:33
|
||||
# github.event.repository.default_branch refers to the target repo's setting,
|
||||
# which only admins can change; not influenced by fork PR authors.
|
||||
- ci-docs.yml:45
|
||||
# steps.setup-python.outputs.python-version is always a semver string (e.g. "3.12.0")
|
||||
# produced by actions/setup-python from a hardcoded env var input.
|
||||
- ci-docs.yml:88
|
||||
- ci-docs.yml:92
|
||||
# needs.*.result is always one of: success/failure/cancelled/skipped.
|
||||
- ci-docs.yml:131
|
||||
- ci-docs.yml:132
|
||||
# needs.changes.outputs.* is always "true" or "false".
|
||||
- ci-docs.yml:126
|
||||
# steps.build.outputs.digest is always a SHA256 digest (sha256:[a-f0-9]{64}).
|
||||
- ci-docker.yml:152
|
||||
dangerous-triggers:
|
||||
ignore:
|
||||
# Both workflows use pull_request_target solely to label/comment on fork PRs
|
||||
# (requires write-back access unavailable to pull_request). Neither workflow
|
||||
# checks out PR code or executes anything from the fork — only reads PR
|
||||
# metadata via context/API. Permissions are scoped to pull-requests: write.
|
||||
- pr-bot.yml:2
|
||||
- project-actions.yml:2
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -111,3 +111,4 @@ celerybeat-schedule*
|
||||
|
||||
# ignore pnpm package store folder created when setting up the devcontainer
|
||||
.pnpm-store/
|
||||
.worktrees
|
||||
|
||||
@@ -821,11 +821,14 @@ parsing documents.
|
||||
|
||||
#### [`PAPERLESS_OCR_MODE=<mode>`](#PAPERLESS_OCR_MODE) {#PAPERLESS_OCR_MODE}
|
||||
|
||||
: Tell paperless when and how to perform ocr on your documents. Three
|
||||
: Tell paperless when and how to perform OCR on your documents. Four
|
||||
modes are available:
|
||||
|
||||
- `skip`: Paperless skips all pages and will perform ocr only on
|
||||
pages where no text is present. This is the safest option.
|
||||
- `auto` (default): Paperless detects whether a document already
|
||||
has embedded text via pdftotext. If sufficient text is found,
|
||||
OCR is skipped for that document (`--skip-text`). If no text is
|
||||
present, OCR runs normally. This is the safest option for mixed
|
||||
document collections.
|
||||
|
||||
- `redo`: Paperless will OCR all pages of your documents and
|
||||
attempt to replace any existing text layers with new text. This
|
||||
@@ -843,24 +846,59 @@ modes are available:
|
||||
significantly larger and text won't appear as sharp when zoomed
|
||||
in.
|
||||
|
||||
The default is `skip`, which only performs OCR when necessary and
|
||||
always creates archived documents.
|
||||
- `off`: Paperless never invokes the OCR engine. For PDFs, text
|
||||
is extracted via pdftotext only. For image documents, text will
|
||||
be empty. Archive file generation still works via format
|
||||
conversion (no Tesseract or Ghostscript required).
|
||||
|
||||
Read more about this in the [OCRmyPDF
|
||||
The default is `auto`.
|
||||
|
||||
For the `auto`, `redo`, and `force` modes, read more about OCR
|
||||
behaviour in the [OCRmyPDF
|
||||
documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
|
||||
|
||||
#### [`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=<mode>`](#PAPERLESS_OCR_SKIP_ARCHIVE_FILE) {#PAPERLESS_OCR_SKIP_ARCHIVE_FILE}
|
||||
#### [`PAPERLESS_ARCHIVE_FILE_GENERATION=<mode>`](#PAPERLESS_ARCHIVE_FILE_GENERATION) {#PAPERLESS_ARCHIVE_FILE_GENERATION}
|
||||
|
||||
: Specify when you would like paperless to skip creating an archived
|
||||
version of your documents. This is useful if you don't want to have two
|
||||
almost-identical versions of your documents in the media folder.
|
||||
: Controls when paperless creates a PDF/A archive version of your
|
||||
documents. Archive files are stored alongside the original and are used
|
||||
for display in the web interface.
|
||||
|
||||
- `never`: Never skip creating an archived version.
|
||||
- `with_text`: Skip creating an archived version for documents
|
||||
that already have embedded text.
|
||||
- `always`: Always skip creating an archived version.
|
||||
- `auto` (default): Produce archives for scanned or image-based
|
||||
documents. Skip archive generation for born-digital PDFs that
|
||||
already contain embedded text. This is the recommended setting
|
||||
for mixed document collections.
|
||||
- `always`: Always produce a PDF/A archive when the parser
|
||||
supports it, regardless of whether the document already has
|
||||
text.
|
||||
- `never`: Never produce an archive. Only the original file is
|
||||
stored. Saves disk space but the web viewer will display the
|
||||
original file directly.
|
||||
|
||||
The default is `never`.
|
||||
**Behaviour by file type and mode** (`auto` column shows the default):
|
||||
|
||||
| Document type | `never` | `auto` (default) | `always` |
|
||||
| -------------------------- | ------- | -------------------------- | -------- |
|
||||
| Scanned image (TIFF, JPEG) | No | **Yes** | Yes |
|
||||
| Image-based PDF | No | **Yes** (short/no text, untagged) | Yes |
|
||||
| Born-digital PDF | No | No (tagged or has embedded text) | Yes |
|
||||
| Plain text, email, HTML | No | No | No |
|
||||
| DOCX / ODT (via Tika) | Yes\* | Yes\* | Yes\* |
|
||||
|
||||
\* Tika always produces a PDF rendition for display; this counts as
|
||||
the archive regardless of the setting.
|
||||
|
||||
!!! note
|
||||
|
||||
This setting applies to the built-in Tesseract parser. Parsers
|
||||
that must always convert documents to PDF for display (e.g. DOCX,
|
||||
ODT via Tika) will produce a PDF regardless of this setting.
|
||||
|
||||
!!! note
|
||||
|
||||
The **remote OCR parser** (Azure AI) always produces a searchable
|
||||
PDF and stores it as the archive copy, regardless of this setting.
|
||||
`PAPERLESS_ARCHIVE_FILE_GENERATION=never` has no effect when the remote
|
||||
parser handles a document.
|
||||
|
||||
#### [`PAPERLESS_OCR_CLEAN=<mode>`](#PAPERLESS_OCR_CLEAN) {#PAPERLESS_OCR_CLEAN}
|
||||
|
||||
|
||||
@@ -123,7 +123,68 @@ Multiple options are combined in a single value:
|
||||
PAPERLESS_DB_OPTIONS="sslmode=require;sslrootcert=/certs/ca.pem;pool.max_size=10"
|
||||
```
|
||||
|
||||
## Search Index (Whoosh -> Tantivy)
|
||||
## OCR and Archive File Generation Settings
|
||||
|
||||
The settings that control OCR behaviour and archive file generation have been redesigned. The old settings that coupled these two concerns together are **removed** — old values are not silently honoured; a startup warning is logged if any removed variable is still set in your environment.
|
||||
|
||||
### Removed settings
|
||||
|
||||
| Removed Setting | Replacement |
|
||||
| ------------------------------------------- | --------------------------------------------------------------------- |
|
||||
| `PAPERLESS_OCR_MODE=skip` | `PAPERLESS_OCR_MODE=auto` (new default) |
|
||||
| `PAPERLESS_OCR_MODE=skip_noarchive` | `PAPERLESS_OCR_MODE=auto` + `PAPERLESS_ARCHIVE_FILE_GENERATION=never` |
|
||||
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never` | `PAPERLESS_ARCHIVE_FILE_GENERATION=always` |
|
||||
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text` | `PAPERLESS_ARCHIVE_FILE_GENERATION=auto` (new default) |
|
||||
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always` | `PAPERLESS_ARCHIVE_FILE_GENERATION=never` |
|
||||
|
||||
### What changed and why
|
||||
|
||||
Previously, `OCR_MODE` conflated two independent concerns: whether to run OCR and whether to produce an archive. `skip` meant "skip OCR if text exists, but always produce an archive". `skip_noarchive` meant "skip OCR if text exists, and also skip the archive". This made it impossible to, for example, disable OCR entirely while still producing archives.
|
||||
|
||||
The new settings are independent:
|
||||
|
||||
- [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) controls OCR: `auto` (default), `force`, `redo`, `off`.
|
||||
- [`PAPERLESS_ARCHIVE_FILE_GENERATION`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) controls archive production: `auto` (default), `always`, `never`.
|
||||
|
||||
### Database configuration
|
||||
|
||||
If you changed OCR settings via the admin UI (ApplicationConfiguration), the database values are **migrated automatically** during the upgrade. `mode` values (`skip` / `skip_noarchive`) are mapped to their new equivalents and `skip_archive_file` values are converted to the new `archive_file_generation` field. After upgrading, review the OCR settings in the admin UI to confirm the migrated values match your intent.
|
||||
|
||||
### Action Required
|
||||
|
||||
Remove any `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` variable from your environment. If you relied on `OCR_MODE=skip` or `OCR_MODE=skip_noarchive`, update accordingly:
|
||||
|
||||
```bash
|
||||
# v2: skip OCR when text present, always archive
|
||||
PAPERLESS_OCR_MODE=skip
|
||||
# v3: equivalent (auto is the new default)
|
||||
# Remove PAPERLESS_OCR_MODE=skip (or set PAPERLESS_OCR_MODE=auto) — auto is the default
|
||||
|
||||
# v2: skip OCR when text present, skip archive too
|
||||
PAPERLESS_OCR_MODE=skip_noarchive
|
||||
# v3: equivalent
|
||||
PAPERLESS_OCR_MODE=auto
|
||||
PAPERLESS_ARCHIVE_FILE_GENERATION=never
|
||||
|
||||
# v2: always skip archive
|
||||
PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always
|
||||
# v3: equivalent
|
||||
PAPERLESS_ARCHIVE_FILE_GENERATION=never
|
||||
|
||||
# v2: skip archive only for born-digital docs
|
||||
PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text
|
||||
# v3: equivalent (auto is the new default)
|
||||
PAPERLESS_ARCHIVE_FILE_GENERATION=auto
|
||||
```
|
||||
|
||||
### Remote OCR parser
|
||||
|
||||
If you use the **remote OCR parser** (Azure AI), note that it always produces a
|
||||
searchable PDF and stores it as the archive copy. `ARCHIVE_FILE_GENERATION=never`
|
||||
has no effect for documents handled by the remote parser — the archive is produced
|
||||
unconditionally by the remote engine.
|
||||
|
||||
## Search Index (Whoosh -> Tantivy)
|
||||
|
||||
The full-text search backend has been replaced with [Tantivy](https://github.com/quickwit-oss/tantivy).
|
||||
The index format is incompatible with Whoosh, so **the search index is automatically rebuilt from
|
||||
|
||||
@@ -633,12 +633,11 @@ hardware, but a few settings can improve performance:
|
||||
consumption, so you might want to lower these settings (example: 2
|
||||
workers and 1 thread to always have some computing power left for
|
||||
other tasks).
|
||||
- Keep [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) at its default value `skip` and consider
|
||||
- Keep [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) at its default value `auto` and consider
|
||||
OCRing your documents before feeding them into Paperless. Some
|
||||
scanners are able to do this!
|
||||
- Set [`PAPERLESS_OCR_SKIP_ARCHIVE_FILE`](configuration.md#PAPERLESS_OCR_SKIP_ARCHIVE_FILE) to `with_text` to skip archive
|
||||
file generation for already OCRed documents, or `always` to skip it
|
||||
for all documents.
|
||||
- Set [`PAPERLESS_ARCHIVE_FILE_GENERATION`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) to `never` to skip archive
|
||||
file generation entirely, saving disk space at the cost of in-browser PDF/A viewing.
|
||||
- If you want to perform OCR on the device, consider using
|
||||
`PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use
|
||||
less memory at the expense of slightly worse OCR results.
|
||||
|
||||
@@ -134,9 +134,9 @@ following operations on your documents:
|
||||
!!! tip
|
||||
|
||||
This process can be configured to fit your needs. If you don't want
|
||||
paperless to create archived versions for digital documents, you can
|
||||
configure that by configuring
|
||||
`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text`. Please read the
|
||||
paperless to create archived versions for born-digital documents, set
|
||||
[`PAPERLESS_ARCHIVE_FILE_GENERATION=auto`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION)
|
||||
(the default). To skip archives entirely, use `never`. Please read the
|
||||
[relevant section in the documentation](configuration.md#ocr).
|
||||
|
||||
!!! note
|
||||
@@ -457,7 +457,7 @@ fields and permissions, which will be merged.
|
||||
|
||||
#### Types {#workflow-trigger-types}
|
||||
|
||||
Currently, there are five events that correspond to workflow trigger 'types':
|
||||
Currently, there are four events that correspond to workflow trigger 'types':
|
||||
|
||||
1. **Consumption Started**: _before_ a document is consumed, so events can include filters by source (mail, consumption
|
||||
folder or API), file path, file name, mail rule
|
||||
@@ -469,10 +469,8 @@ Currently, there are five events that correspond to workflow trigger 'types':
|
||||
4. **Scheduled**: a scheduled trigger that can be used to run workflows at a specific time. The date used can be either the document
|
||||
added, created, updated date or you can specify a (date) custom field. You can also specify a day offset from the date (positive
|
||||
offsets will trigger after the date, negative offsets will trigger before).
|
||||
5. **Version Added**: when a new version is added for an existing document. This trigger evaluates filters against the root document
|
||||
and applies actions to the root document.
|
||||
|
||||
The following flow diagram illustrates the document trigger types:
|
||||
The following flow diagram illustrates the four document trigger types:
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
@@ -488,10 +486,6 @@ flowchart TD
|
||||
'Updated'
|
||||
trigger(s)"}
|
||||
|
||||
version{"Matching
|
||||
'Version Added'
|
||||
trigger(s)"}
|
||||
|
||||
scheduled{"Documents
|
||||
matching
|
||||
trigger(s)"}
|
||||
@@ -508,15 +502,11 @@ flowchart TD
|
||||
updated --> |Yes| J[Workflow Actions Run]
|
||||
updated --> |No| K
|
||||
J --> K[Document Saved]
|
||||
L[New Document Version Added] --> version
|
||||
version --> |Yes| V[Workflow Actions Run]
|
||||
version --> |No| W
|
||||
V --> W[Document Saved]
|
||||
X[Scheduled Task Check<br/>hourly at :05] --> Y[Get All Scheduled Triggers]
|
||||
Y --> scheduled
|
||||
scheduled --> |Yes| Z[Workflow Actions Run]
|
||||
scheduled --> |No| AA[Document Saved]
|
||||
Z --> AA
|
||||
L[Scheduled Task Check<br/>hourly at :05] --> M[Get All Scheduled Triggers]
|
||||
M --> scheduled
|
||||
scheduled --> |Yes| N[Workflow Actions Run]
|
||||
scheduled --> |No| O[Document Saved]
|
||||
N --> O
|
||||
```
|
||||
|
||||
#### Filters {#workflow-trigger-filters}
|
||||
|
||||
@@ -10456,8 +10456,8 @@
|
||||
<context context-type="linenumber">111</context>
|
||||
</context-group>
|
||||
</trans-unit>
|
||||
<trans-unit id="6114528299376689399" datatype="html">
|
||||
<source>Skip Archive File</source>
|
||||
<trans-unit id="8305051609904776938" datatype="html">
|
||||
<source>Archive File Generation</source>
|
||||
<context-group purpose="location">
|
||||
<context context-type="sourcefile">src/app/data/paperless-config.ts</context>
|
||||
<context context-type="linenumber">119</context>
|
||||
|
||||
18
src-ui/pnpm-lock.yaml
generated
18
src-ui/pnpm-lock.yaml
generated
@@ -1624,8 +1624,8 @@ packages:
|
||||
'@harperfast/extended-iterable@1.0.3':
|
||||
resolution: {integrity: sha512-sSAYhQca3rDWtQUHSAPeO7axFIUJOI6hn1gjRC5APVE1a90tuyT8f5WIgRsFhhWA7htNkju2veB9eWL6YHi/Lw==}
|
||||
|
||||
'@hono/node-server@1.19.12':
|
||||
resolution: {integrity: sha512-txsUW4SQ1iilgE0l9/e9VQWmELXifEFvmdA1j6WFh/aFPj99hIntrSsq/if0UWyGVkmrRPKA1wCeP+UCr1B9Uw==}
|
||||
'@hono/node-server@1.19.13':
|
||||
resolution: {integrity: sha512-TsQLe4i2gvoTtrHje625ngThGBySOgSK3Xo2XRYOdqGN1teR8+I7vchQC46uLJi8OF62YTYA3AhSpumtkhsaKQ==}
|
||||
engines: {node: '>=18.14.1'}
|
||||
peerDependencies:
|
||||
hono: ^4
|
||||
@@ -4380,8 +4380,8 @@ packages:
|
||||
resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
hono@4.12.9:
|
||||
resolution: {integrity: sha512-wy3T8Zm2bsEvxKZM5w21VdHDDcwVS1yUFFY6i8UobSsKfFceT7TOwhbhfKsDyx7tYQlmRM5FLpIuYvNFyjctiA==}
|
||||
hono@4.12.12:
|
||||
resolution: {integrity: sha512-p1JfQMKaceuCbpJKAPKVqyqviZdS0eUxH9v82oWo1kb9xjQ5wA6iP3FNVAPDFlz5/p7d45lO+BpSk1tuSZMF4Q==}
|
||||
engines: {node: '>=16.9.0'}
|
||||
|
||||
hosted-git-info@9.0.2:
|
||||
@@ -8515,9 +8515,9 @@ snapshots:
|
||||
'@harperfast/extended-iterable@1.0.3':
|
||||
optional: true
|
||||
|
||||
'@hono/node-server@1.19.12(hono@4.12.9)':
|
||||
'@hono/node-server@1.19.13(hono@4.12.12)':
|
||||
dependencies:
|
||||
hono: 4.12.9
|
||||
hono: 4.12.12
|
||||
|
||||
'@humanfs/core@0.19.1': {}
|
||||
|
||||
@@ -9139,7 +9139,7 @@ snapshots:
|
||||
|
||||
'@modelcontextprotocol/sdk@1.26.0(zod@4.3.6)':
|
||||
dependencies:
|
||||
'@hono/node-server': 1.19.12(hono@4.12.9)
|
||||
'@hono/node-server': 1.19.13(hono@4.12.12)
|
||||
ajv: 8.18.0
|
||||
ajv-formats: 3.0.1(ajv@8.18.0)
|
||||
content-type: 1.0.5
|
||||
@@ -9149,7 +9149,7 @@ snapshots:
|
||||
eventsource-parser: 3.0.6
|
||||
express: 5.2.1
|
||||
express-rate-limit: 8.3.2(express@5.2.1)
|
||||
hono: 4.12.9
|
||||
hono: 4.12.12
|
||||
jose: 6.2.2
|
||||
json-schema-typed: 8.0.2
|
||||
pkce-challenge: 5.0.1
|
||||
@@ -11392,7 +11392,7 @@ snapshots:
|
||||
dependencies:
|
||||
function-bind: 1.1.2
|
||||
|
||||
hono@4.12.9: {}
|
||||
hono@4.12.12: {}
|
||||
|
||||
hosted-git-info@9.0.2:
|
||||
dependencies:
|
||||
|
||||
@@ -164,7 +164,7 @@
|
||||
<pngx-input-text i18n-title title="Filter path" formControlName="filter_path" horizontal="true" i18n-hint hint="Apply to documents that match this path. Wildcards specified as * are allowed. Case-normalized." [error]="error?.filter_path"></pngx-input-text>
|
||||
<pngx-input-select i18n-title title="Filter mail rule" [items]="mailRules" horizontal="true" [allowNull]="true" formControlName="filter_mailrule" i18n-hint hint="Apply to documents consumed via this mail rule." [error]="error?.filter_mailrule"></pngx-input-select>
|
||||
}
|
||||
@if (formGroup.get('type').value === WorkflowTriggerType.DocumentAdded || formGroup.get('type').value === WorkflowTriggerType.DocumentUpdated || formGroup.get('type').value === WorkflowTriggerType.Scheduled || formGroup.get('type').value === WorkflowTriggerType.VersionAdded) {
|
||||
@if (formGroup.get('type').value === WorkflowTriggerType.DocumentAdded || formGroup.get('type').value === WorkflowTriggerType.DocumentUpdated || formGroup.get('type').value === WorkflowTriggerType.Scheduled) {
|
||||
<pngx-input-select i18n-title title="Content matching algorithm" horizontal="true" [items]="getMatchingAlgorithms()" formControlName="matching_algorithm"></pngx-input-select>
|
||||
@if (matchingPatternRequired(formGroup)) {
|
||||
<pngx-input-text i18n-title title="Content matching pattern" horizontal="true" formControlName="match" [error]="error?.match"></pngx-input-text>
|
||||
@@ -175,7 +175,7 @@
|
||||
}
|
||||
</div>
|
||||
</div>
|
||||
@if (formGroup.get('type').value === WorkflowTriggerType.DocumentAdded || formGroup.get('type').value === WorkflowTriggerType.DocumentUpdated || formGroup.get('type').value === WorkflowTriggerType.Scheduled || formGroup.get('type').value === WorkflowTriggerType.VersionAdded) {
|
||||
@if (formGroup.get('type').value === WorkflowTriggerType.DocumentAdded || formGroup.get('type').value === WorkflowTriggerType.DocumentUpdated || formGroup.get('type').value === WorkflowTriggerType.Scheduled) {
|
||||
<div class="row mt-3">
|
||||
<div class="col">
|
||||
<div class="trigger-filters mb-3">
|
||||
|
||||
@@ -120,10 +120,6 @@ export const WORKFLOW_TYPE_OPTIONS = [
|
||||
id: WorkflowTriggerType.Scheduled,
|
||||
name: $localize`Scheduled`,
|
||||
},
|
||||
{
|
||||
id: WorkflowTriggerType.VersionAdded,
|
||||
name: $localize`Version Added`,
|
||||
},
|
||||
]
|
||||
|
||||
export const WORKFLOW_ACTION_OPTIONS = [
|
||||
|
||||
@@ -11,16 +11,16 @@ export enum OutputTypeConfig {
|
||||
}
|
||||
|
||||
export enum ModeConfig {
|
||||
SKIP = 'skip',
|
||||
REDO = 'redo',
|
||||
AUTO = 'auto',
|
||||
FORCE = 'force',
|
||||
SKIP_NO_ARCHIVE = 'skip_noarchive',
|
||||
REDO = 'redo',
|
||||
OFF = 'off',
|
||||
}
|
||||
|
||||
export enum ArchiveFileConfig {
|
||||
NEVER = 'never',
|
||||
WITH_TEXT = 'with_text',
|
||||
AUTO = 'auto',
|
||||
ALWAYS = 'always',
|
||||
NEVER = 'never',
|
||||
}
|
||||
|
||||
export enum CleanConfig {
|
||||
@@ -115,11 +115,11 @@ export const PaperlessConfigOptions: ConfigOption[] = [
|
||||
category: ConfigCategory.OCR,
|
||||
},
|
||||
{
|
||||
key: 'skip_archive_file',
|
||||
title: $localize`Skip Archive File`,
|
||||
key: 'archive_file_generation',
|
||||
title: $localize`Archive File Generation`,
|
||||
type: ConfigOptionType.Select,
|
||||
choices: mapToItems(ArchiveFileConfig),
|
||||
config_key: 'PAPERLESS_OCR_SKIP_ARCHIVE_FILE',
|
||||
config_key: 'PAPERLESS_ARCHIVE_FILE_GENERATION',
|
||||
category: ConfigCategory.OCR,
|
||||
},
|
||||
{
|
||||
@@ -337,7 +337,7 @@ export interface PaperlessConfig extends ObjectWithId {
|
||||
pages: number
|
||||
language: string
|
||||
mode: ModeConfig
|
||||
skip_archive_file: ArchiveFileConfig
|
||||
archive_file_generation: ArchiveFileConfig
|
||||
image_dpi: number
|
||||
unpaper_clean: CleanConfig
|
||||
deskew: boolean
|
||||
|
||||
@@ -12,7 +12,6 @@ export enum WorkflowTriggerType {
|
||||
DocumentAdded = 2,
|
||||
DocumentUpdated = 3,
|
||||
Scheduled = 4,
|
||||
VersionAdded = 5,
|
||||
}
|
||||
|
||||
export enum ScheduleDateField {
|
||||
|
||||
@@ -10,13 +10,11 @@ class DocumentsConfig(AppConfig):
|
||||
def ready(self) -> None:
|
||||
from documents.signals import document_consumption_finished
|
||||
from documents.signals import document_updated
|
||||
from documents.signals import document_version_added
|
||||
from documents.signals.handlers import add_inbox_tags
|
||||
from documents.signals.handlers import add_or_update_document_in_llm_index
|
||||
from documents.signals.handlers import add_to_index
|
||||
from documents.signals.handlers import run_workflows_added
|
||||
from documents.signals.handlers import run_workflows_updated
|
||||
from documents.signals.handlers import run_workflows_version_added
|
||||
from documents.signals.handlers import send_websocket_document_updated
|
||||
from documents.signals.handlers import set_correspondent
|
||||
from documents.signals.handlers import set_document_type
|
||||
@@ -30,7 +28,6 @@ class DocumentsConfig(AppConfig):
|
||||
document_consumption_finished.connect(set_storage_path)
|
||||
document_consumption_finished.connect(add_to_index)
|
||||
document_consumption_finished.connect(run_workflows_added)
|
||||
document_version_added.connect(run_workflows_version_added)
|
||||
document_consumption_finished.connect(add_or_update_document_in_llm_index)
|
||||
document_updated.connect(run_workflows_updated)
|
||||
document_updated.connect(send_websocket_document_updated)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
@@ -44,16 +45,20 @@ from documents.plugins.helpers import ProgressStatusOptions
|
||||
from documents.signals import document_consumption_finished
|
||||
from documents.signals import document_consumption_started
|
||||
from documents.signals import document_updated
|
||||
from documents.signals import document_version_added
|
||||
from documents.signals.handlers import run_workflows
|
||||
from documents.templating.workflows import parse_w_workflow_placeholders
|
||||
from documents.utils import compute_checksum
|
||||
from documents.utils import copy_basic_file_stats
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileGenerationChoices
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
|
||||
from paperless.parsers.utils import extract_pdf_text
|
||||
from paperless.parsers.utils import is_tagged_pdf
|
||||
|
||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||
|
||||
@@ -106,6 +111,74 @@ class ConsumerStatusShortMessage(StrEnum):
|
||||
FAILED = "failed"
|
||||
|
||||
|
||||
def should_produce_archive(
|
||||
parser: "ParserProtocol",
|
||||
mime_type: str,
|
||||
document_path: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> bool:
|
||||
"""Return True if a PDF/A archive should be produced for this document.
|
||||
|
||||
IMPORTANT: *parser* must be an instantiated parser, not the class.
|
||||
``requires_pdf_rendition`` and ``can_produce_archive`` are instance
|
||||
``@property`` methods — accessing them on the class returns the descriptor
|
||||
(always truthy).
|
||||
"""
|
||||
_log = log or logging.getLogger(LOGGING_NAME)
|
||||
|
||||
# Must produce a PDF so the frontend can display the original format at all.
|
||||
if parser.requires_pdf_rendition:
|
||||
_log.debug("Archive: yes — parser requires PDF rendition for frontend display")
|
||||
return True
|
||||
|
||||
# Parser cannot produce an archive (e.g. TextDocumentParser).
|
||||
if not parser.can_produce_archive:
|
||||
_log.debug("Archive: no — parser cannot produce archives")
|
||||
return False
|
||||
|
||||
generation = OcrConfig().archive_file_generation
|
||||
|
||||
if generation == ArchiveFileGenerationChoices.ALWAYS:
|
||||
_log.debug("Archive: yes — ARCHIVE_FILE_GENERATION=always")
|
||||
return True
|
||||
if generation == ArchiveFileGenerationChoices.NEVER:
|
||||
_log.debug("Archive: no — ARCHIVE_FILE_GENERATION=never")
|
||||
return False
|
||||
|
||||
# auto: produce archives for scanned/image documents; skip for born-digital PDFs.
|
||||
if mime_type.startswith("image/"):
|
||||
_log.debug("Archive: yes — image document, ARCHIVE_FILE_GENERATION=auto")
|
||||
return True
|
||||
if mime_type == "application/pdf":
|
||||
if is_tagged_pdf(document_path):
|
||||
_log.debug(
|
||||
"Archive: no — born-digital PDF (structure tags detected),"
|
||||
" ARCHIVE_FILE_GENERATION=auto",
|
||||
)
|
||||
return False
|
||||
text = extract_pdf_text(document_path)
|
||||
if text is None or len(text) <= PDF_TEXT_MIN_LENGTH:
|
||||
_log.debug(
|
||||
"Archive: yes — scanned PDF (text_length=%d ≤ %d),"
|
||||
" ARCHIVE_FILE_GENERATION=auto",
|
||||
len(text) if text else 0,
|
||||
PDF_TEXT_MIN_LENGTH,
|
||||
)
|
||||
return True
|
||||
_log.debug(
|
||||
"Archive: no — born-digital PDF (text_length=%d > %d),"
|
||||
" ARCHIVE_FILE_GENERATION=auto",
|
||||
len(text),
|
||||
PDF_TEXT_MIN_LENGTH,
|
||||
)
|
||||
return False
|
||||
_log.debug(
|
||||
"Archive: no — MIME type %r not eligible for auto archive generation",
|
||||
mime_type,
|
||||
)
|
||||
return False
|
||||
|
||||
|
||||
class ConsumerPluginMixin:
|
||||
if TYPE_CHECKING:
|
||||
from logging import Logger
|
||||
@@ -437,7 +510,17 @@ class ConsumerPlugin(
|
||||
)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
|
||||
document_parser.parse(self.working_copy, mime_type)
|
||||
produce_archive = should_produce_archive(
|
||||
document_parser,
|
||||
mime_type,
|
||||
self.working_copy,
|
||||
self.log,
|
||||
)
|
||||
document_parser.parse(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
produce_archive=produce_archive,
|
||||
)
|
||||
|
||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||
self._send_progress(
|
||||
@@ -577,13 +660,6 @@ class ConsumerPlugin(
|
||||
else self.working_copy,
|
||||
)
|
||||
|
||||
if document.root_document_id:
|
||||
document_version_added.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
)
|
||||
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
@@ -793,7 +869,7 @@ class ConsumerPlugin(
|
||||
|
||||
return document
|
||||
|
||||
def apply_overrides(self, document) -> None:
|
||||
def apply_overrides(self, document: Document) -> None:
|
||||
if self.metadata.correspondent_id:
|
||||
document.correspondent = Correspondent.objects.get(
|
||||
pk=self.metadata.correspondent_id,
|
||||
|
||||
@@ -689,7 +689,6 @@ def document_matches_workflow(
|
||||
trigger_type == WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED
|
||||
or trigger_type == WorkflowTrigger.WorkflowTriggerType.DOCUMENT_UPDATED
|
||||
or trigger_type == WorkflowTrigger.WorkflowTriggerType.SCHEDULED
|
||||
or trigger_type == WorkflowTrigger.WorkflowTriggerType.VERSION_ADDED
|
||||
):
|
||||
trigger_matched, reason = existing_document_matches_workflow(
|
||||
document,
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
# Generated by Django 5.2.7 on 2026-03-02 00:00
|
||||
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("documents", "0018_saved_view_simple_search_rules"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name="workflowtrigger",
|
||||
name="type",
|
||||
field=models.PositiveSmallIntegerField(
|
||||
choices=[
|
||||
(1, "Consumption Started"),
|
||||
(2, "Document Added"),
|
||||
(3, "Document Updated"),
|
||||
(4, "Scheduled"),
|
||||
(5, "Version Added"),
|
||||
],
|
||||
default=1,
|
||||
verbose_name="Workflow Trigger Type",
|
||||
),
|
||||
),
|
||||
]
|
||||
@@ -1183,7 +1183,6 @@ class WorkflowTrigger(models.Model):
|
||||
DOCUMENT_ADDED = 2, _("Document Added")
|
||||
DOCUMENT_UPDATED = 3, _("Document Updated")
|
||||
SCHEDULED = 4, _("Scheduled")
|
||||
VERSION_ADDED = 5, _("Version Added")
|
||||
|
||||
class DocumentSourceChoices(models.IntegerChoices):
|
||||
CONSUME_FOLDER = DocumentSource.ConsumeFolder.value, _("Consume Folder")
|
||||
|
||||
@@ -3,4 +3,3 @@ from django.dispatch import Signal
|
||||
document_consumption_started = Signal()
|
||||
document_consumption_finished = Signal()
|
||||
document_updated = Signal()
|
||||
document_version_added = Signal()
|
||||
|
||||
@@ -814,19 +814,6 @@ def run_workflows_added(
|
||||
)
|
||||
|
||||
|
||||
def run_workflows_version_added(
|
||||
sender,
|
||||
document: Document,
|
||||
logging_group: uuid.UUID | None = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
run_workflows(
|
||||
trigger_type=WorkflowTrigger.WorkflowTriggerType.VERSION_ADDED,
|
||||
document=document.root_document,
|
||||
logging_group=logging_group,
|
||||
)
|
||||
|
||||
|
||||
def run_workflows_updated(
|
||||
sender,
|
||||
document: Document,
|
||||
|
||||
@@ -30,6 +30,7 @@ from documents.consumer import AsnCheckPlugin
|
||||
from documents.consumer import ConsumerPlugin
|
||||
from documents.consumer import ConsumerPreflightPlugin
|
||||
from documents.consumer import WorkflowTriggerPlugin
|
||||
from documents.consumer import should_produce_archive
|
||||
from documents.data_models import ConsumableDocument
|
||||
from documents.data_models import DocumentMetadataOverrides
|
||||
from documents.double_sided import CollatePlugin
|
||||
@@ -311,7 +312,16 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
parser.configure(ParserContext())
|
||||
|
||||
try:
|
||||
parser.parse(document.source_path, mime_type)
|
||||
produce_archive = should_produce_archive(
|
||||
parser,
|
||||
mime_type,
|
||||
document.source_path,
|
||||
)
|
||||
parser.parse(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
produce_archive=produce_archive,
|
||||
)
|
||||
|
||||
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
|
||||
"pages": None,
|
||||
"language": None,
|
||||
"mode": None,
|
||||
"skip_archive_file": None,
|
||||
"archive_file_generation": None,
|
||||
"image_dpi": None,
|
||||
"unpaper_clean": None,
|
||||
"deskew": None,
|
||||
|
||||
@@ -1020,7 +1020,7 @@ class TestTagBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, Tes
|
||||
CONSUMER_TAG_BARCODE_SPLIT=True,
|
||||
CONSUMER_TAG_BARCODE_MAPPING={"TAG:(.*)": "\\g<1>"},
|
||||
CELERY_TASK_ALWAYS_EAGER=True,
|
||||
OCR_MODE="skip",
|
||||
OCR_MODE="auto",
|
||||
)
|
||||
def test_consume_barcode_file_tag_split_and_assignment(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -230,7 +230,11 @@ class TestConsumer(
|
||||
shutil.copy(src, dst)
|
||||
return dst
|
||||
|
||||
@override_settings(FILENAME_FORMAT=None, TIME_ZONE="America/Chicago")
|
||||
@override_settings(
|
||||
FILENAME_FORMAT=None,
|
||||
TIME_ZONE="America/Chicago",
|
||||
ARCHIVE_FILE_GENERATION="always",
|
||||
)
|
||||
def testNormalOperation(self) -> None:
|
||||
filename = self.get_test_file()
|
||||
|
||||
@@ -629,7 +633,10 @@ class TestConsumer(
|
||||
# Database empty
|
||||
self.assertEqual(Document.objects.all().count(), 0)
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
|
||||
@override_settings(
|
||||
FILENAME_FORMAT="{correspondent}/{title}",
|
||||
ARCHIVE_FILE_GENERATION="always",
|
||||
)
|
||||
def testFilenameHandling(self) -> None:
|
||||
with self.get_consumer(
|
||||
self.get_test_file(),
|
||||
@@ -646,7 +653,7 @@ class TestConsumer(
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@mock.patch("documents.consumer.generate_unique_filename")
|
||||
@override_settings(FILENAME_FORMAT="{pk}")
|
||||
@override_settings(FILENAME_FORMAT="{pk}", ARCHIVE_FILE_GENERATION="always")
|
||||
def testFilenameHandlingFallsBackWhenGeneratedPathExceedsDbLimit(self, m):
|
||||
m.side_effect = lambda doc, archive_filename=False: Path(
|
||||
("a" * 1100 + ".pdf") if not archive_filename else ("b" * 1100 + ".pdf"),
|
||||
@@ -673,7 +680,10 @@ class TestConsumer(
|
||||
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
|
||||
@override_settings(
|
||||
FILENAME_FORMAT="{correspondent}/{title}",
|
||||
ARCHIVE_FILE_GENERATION="always",
|
||||
)
|
||||
@mock.patch("documents.signals.handlers.generate_unique_filename")
|
||||
def testFilenameHandlingUnstableFormat(self, m) -> None:
|
||||
filenames = ["this", "that", "now this", "i cannot decide"]
|
||||
@@ -720,16 +730,9 @@ class TestConsumer(
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@override_settings(AUDIT_LOG_ENABLED=True)
|
||||
@mock.patch("documents.consumer.document_updated.send")
|
||||
@mock.patch("documents.consumer.document_version_added.send")
|
||||
@mock.patch("documents.consumer.load_classifier")
|
||||
def test_consume_version_creates_new_version(
|
||||
self,
|
||||
mock_load_classifier: mock.Mock,
|
||||
mock_document_version_added_send: mock.Mock,
|
||||
mock_document_updated_send: mock.Mock,
|
||||
) -> None:
|
||||
mock_load_classifier.return_value = MagicMock()
|
||||
def test_consume_version_creates_new_version(self, m) -> None:
|
||||
m.return_value = MagicMock()
|
||||
|
||||
with self.get_consumer(self.get_test_file()) as consumer:
|
||||
consumer.run()
|
||||
@@ -797,16 +800,6 @@ class TestConsumer(
|
||||
self.assertIsNone(version.archive_serial_number)
|
||||
self.assertEqual(version.original_filename, version_file.name)
|
||||
self.assertTrue(bool(version.content))
|
||||
mock_document_version_added_send.assert_called_once()
|
||||
self.assertEqual(
|
||||
mock_document_version_added_send.call_args.kwargs["document"].id,
|
||||
version.id,
|
||||
)
|
||||
mock_document_updated_send.assert_called_once()
|
||||
self.assertEqual(
|
||||
mock_document_updated_send.call_args.kwargs["document"].id,
|
||||
root_doc.id,
|
||||
)
|
||||
|
||||
@override_settings(AUDIT_LOG_ENABLED=True)
|
||||
@mock.patch("documents.consumer.load_classifier")
|
||||
@@ -1038,7 +1031,7 @@ class TestConsumer(
|
||||
self.assertEqual(Document.objects.count(), 2)
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{title}")
|
||||
@override_settings(FILENAME_FORMAT="{title}", ARCHIVE_FILE_GENERATION="always")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def test_similar_filenames(self, m) -> None:
|
||||
shutil.copy(
|
||||
@@ -1149,6 +1142,7 @@ class TestConsumer(
|
||||
mock_mail_parser_parse.assert_called_once_with(
|
||||
consumer.working_copy,
|
||||
"message/rfc822",
|
||||
produce_archive=True,
|
||||
)
|
||||
|
||||
|
||||
@@ -1296,7 +1290,14 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
|
||||
def test_no_pre_consume_script(self, m) -> None:
|
||||
with self.get_consumer(self.test_file) as c:
|
||||
c.run()
|
||||
m.assert_not_called()
|
||||
# Verify no pre-consume script subprocess was invoked
|
||||
# (run_subprocess may still be called by _extract_text_for_archive_check)
|
||||
script_calls = [
|
||||
call
|
||||
for call in m.call_args_list
|
||||
if call.args and call.args[0] and call.args[0][0] not in ("pdftotext",)
|
||||
]
|
||||
self.assertEqual(script_calls, [])
|
||||
|
||||
@mock.patch("documents.consumer.run_subprocess")
|
||||
@override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
|
||||
@@ -1312,9 +1313,16 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
|
||||
with self.get_consumer(self.test_file) as c:
|
||||
c.run()
|
||||
|
||||
m.assert_called_once()
|
||||
self.assertTrue(m.called)
|
||||
|
||||
args, _ = m.call_args
|
||||
# Find the call that invoked the pre-consume script
|
||||
# (run_subprocess may also be called by _extract_text_for_archive_check)
|
||||
script_call = next(
|
||||
call
|
||||
for call in m.call_args_list
|
||||
if call.args and call.args[0] and call.args[0][0] == script.name
|
||||
)
|
||||
args, _ = script_call
|
||||
|
||||
command = args[0]
|
||||
environment = args[1]
|
||||
|
||||
189
src/documents/tests/test_consumer_archive.py
Normal file
189
src/documents/tests/test_consumer_archive.py
Normal file
@@ -0,0 +1,189 @@
|
||||
"""Tests for should_produce_archive()."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from documents.consumer import should_produce_archive
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
|
||||
def _parser_instance(
|
||||
*,
|
||||
can_produce: bool = True,
|
||||
requires_rendition: bool = False,
|
||||
) -> MagicMock:
|
||||
"""Return a mock parser instance with the given capability flags."""
|
||||
instance = MagicMock()
|
||||
instance.can_produce_archive = can_produce
|
||||
instance.requires_pdf_rendition = requires_rendition
|
||||
return instance
|
||||
|
||||
|
||||
@pytest.fixture()
def null_app_config(mocker) -> MagicMock:
    """Provide a mocked application configuration with every field unset.

    Each listed attribute is explicitly ``None`` so that code reading the
    configuration with ``value or settings.X`` ends up using the Django
    settings value.
    """
    unset_fields = (
        "output_type",
        "pages",
        "language",
        "mode",
        "archive_file_generation",
        "image_dpi",
        "unpaper_clean",
        "deskew",
        "rotate_pages",
        "rotate_pages_threshold",
        "max_image_pixels",
        "color_conversion_strategy",
        "user_args",
    )
    # dict.fromkeys maps every field name to None.
    return mocker.MagicMock(**dict.fromkeys(unset_fields))
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def patch_app_config(mocker, null_app_config):
    """Make every test in this module see the all-``None`` configuration.

    Replaces ``BaseConfig._get_config_instance`` so it returns the
    ``null_app_config`` mock; ``autouse=True`` applies it to each test
    without the test having to request it.
    """
    config_loader = "paperless.config.BaseConfig._get_config_instance"
    mocker.patch(config_loader, return_value=null_app_config)
|
||||
|
||||
|
||||
class TestShouldProduceArchive:
    """Checks the archive decision made by ``should_produce_archive``.

    The ARCHIVE_FILE_GENERATION setting is driven through the
    pytest-django ``settings`` fixture and the parser is a mock carrying
    only the capability flags.
    """

    @pytest.mark.parametrize(
        ("generation", "can_produce", "requires_rendition", "mime", "expected"),
        [
            pytest.param(
                "never",
                True,
                False,
                "application/pdf",
                False,
                id="never-returns-false",
            ),
            pytest.param(
                "always",
                True,
                False,
                "application/pdf",
                True,
                id="always-returns-true",
            ),
            pytest.param(
                "never",
                True,
                True,
                "application/pdf",
                True,
                id="requires-rendition-overrides-never",
            ),
            pytest.param(
                "always",
                False,
                False,
                "text/plain",
                False,
                id="cannot-produce-overrides-always",
            ),
            pytest.param(
                "always",
                False,
                True,
                "application/pdf",
                True,
                id="requires-rendition-wins-even-if-cannot-produce",
            ),
            pytest.param(
                "auto",
                True,
                False,
                "image/tiff",
                True,
                id="auto-image-returns-true",
            ),
            pytest.param(
                "auto",
                True,
                False,
                "message/rfc822",
                False,
                id="auto-non-pdf-non-image-returns-false",
            ),
        ],
    )
    def test_generation_setting(
        self,
        settings,
        generation: str,
        can_produce: bool,  # noqa: FBT001
        requires_rendition: bool,  # noqa: FBT001
        mime: str,
        expected: bool,  # noqa: FBT001
    ) -> None:
        """The setting and the parser capability flags together give ``expected``."""
        settings.ARCHIVE_FILE_GENERATION = generation
        parser_mock = _parser_instance(
            can_produce=can_produce,
            requires_rendition=requires_rendition,
        )

        decision = should_produce_archive(parser_mock, mime, Path("/tmp/doc"))

        assert decision is expected

    @pytest.mark.parametrize(
        ("extracted_text", "expected"),
        [
            pytest.param(
                "This is a born-digital PDF with lots of text content. " * 10,
                False,
                id="born-digital-long-text-skips-archive",
            ),
            pytest.param(None, True, id="no-text-scanned-produces-archive"),
            pytest.param("tiny", True, id="short-text-treated-as-scanned"),
        ],
    )
    def test_auto_pdf_archive_decision(
        self,
        mocker: MockerFixture,
        settings,
        extracted_text: str | None,
        expected: bool,  # noqa: FBT001
    ) -> None:
        """In auto mode an untagged PDF is judged by its extracted text."""
        settings.ARCHIVE_FILE_GENERATION = "auto"
        # Untagged PDF, so the decision depends on the (mocked) extracted text.
        mocker.patch("documents.consumer.is_tagged_pdf", return_value=False)
        mocker.patch("documents.consumer.extract_pdf_text", return_value=extracted_text)
        parser_mock = _parser_instance(can_produce=True, requires_rendition=False)

        decision = should_produce_archive(
            parser_mock,
            "application/pdf",
            Path("/tmp/doc.pdf"),
        )

        assert decision is expected

    def test_tagged_pdf_skips_archive_in_auto_mode(
        self,
        mocker: MockerFixture,
        settings,
    ) -> None:
        """Tagged PDFs (e.g. Word exports) are treated as born-digital regardless of text length."""
        settings.ARCHIVE_FILE_GENERATION = "auto"
        mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
        parser_mock = _parser_instance(can_produce=True, requires_rendition=False)

        decision = should_produce_archive(
            parser_mock,
            "application/pdf",
            Path("/tmp/doc.pdf"),
        )

        assert decision is False

    def test_tagged_pdf_does_not_call_pdftotext(
        self,
        mocker: MockerFixture,
        settings,
    ) -> None:
        """When a PDF is tagged, pdftotext is not invoked (fast path)."""
        settings.ARCHIVE_FILE_GENERATION = "auto"
        mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
        extract_spy = mocker.patch("documents.consumer.extract_pdf_text")
        parser_mock = _parser_instance(can_produce=True, requires_rendition=False)

        should_produce_archive(parser_mock, "application/pdf", Path("/tmp/doc.pdf"))

        extract_spy.assert_not_called()
|
||||
@@ -27,7 +27,10 @@ sample_file: Path = Path(__file__).parent / "samples" / "simple.pdf"
|
||||
|
||||
|
||||
@pytest.mark.management
|
||||
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
|
||||
@override_settings(
|
||||
FILENAME_FORMAT="{correspondent}/{title}",
|
||||
ARCHIVE_FILE_GENERATION="always",
|
||||
)
|
||||
class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
def make_models(self):
|
||||
return Document.objects.create(
|
||||
|
||||
@@ -213,6 +213,7 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
self.assertEqual(Document.global_objects.count(), 0)
|
||||
|
||||
|
||||
@override_settings(ARCHIVE_FILE_GENERATION="always")
|
||||
class TestUpdateContent(DirectoriesMixin, TestCase):
|
||||
def test_update_content_maybe_archive_file(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -61,7 +61,6 @@ from documents.models import WorkflowTrigger
|
||||
from documents.plugins.base import StopConsumeTaskError
|
||||
from documents.serialisers import WorkflowTriggerSerializer
|
||||
from documents.signals import document_consumption_finished
|
||||
from documents.signals import document_version_added
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import DummyProgressManager
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
@@ -1903,53 +1902,6 @@ class TestWorkflows(
|
||||
).exists(),
|
||||
)
|
||||
|
||||
def test_version_added_workflow_runs_on_root_document(self) -> None:
    """
    GIVEN:
        - A workflow with a VERSION_ADDED trigger that assigns a title and owner
        - A root document and a second document whose root_document is the first
    WHEN:
        - document_version_added is sent for the version document
    THEN:
        - The title and owner are applied to the root document
        - The version document's owner stays unset
        - Exactly one workflow run is recorded against the root document
    """
    version_added = WorkflowTrigger.WorkflowTriggerType.VERSION_ADDED

    version_trigger = WorkflowTrigger.objects.create(type=version_added)
    assign_action = WorkflowAction.objects.create(
        assign_title="Updated by version",
        assign_owner=self.user2,
    )
    version_workflow = Workflow.objects.create(name="Version workflow", order=0)
    version_workflow.triggers.add(version_trigger)
    version_workflow.actions.add(assign_action)

    root = Document.objects.create(
        title="root",
        correspondent=self.c,
        original_filename="root.pdf",
    )
    version = Document.objects.create(
        title="version",
        correspondent=self.c,
        original_filename="version.pdf",
        root_document=root,
    )

    document_version_added.send(sender=self.__class__, document=version)

    # Reload both rows so the assertions see what the signal handlers saved.
    root.refresh_from_db()
    version.refresh_from_db()

    self.assertEqual(root.title, "Updated by version")
    self.assertEqual(root.owner, self.user2)
    self.assertIsNone(version.owner)

    recorded_runs = WorkflowRun.objects.filter(
        workflow=version_workflow,
        type=version_added,
        document=root,
    ).count()
    self.assertEqual(recorded_runs, 1)
|
||||
|
||||
def test_document_updated_workflow(self) -> None:
|
||||
trigger = WorkflowTrigger.objects.create(
|
||||
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_UPDATED,
|
||||
|
||||
@@ -2,7 +2,7 @@ msgid ""
|
||||
msgstr ""
|
||||
"Project-Id-Version: paperless-ngx\n"
|
||||
"Report-Msgid-Bugs-To: \n"
|
||||
"POT-Creation-Date: 2026-04-03 20:54+0000\n"
|
||||
"POT-Creation-Date: 2026-04-06 22:51+0000\n"
|
||||
"PO-Revision-Date: 2022-02-17 04:17\n"
|
||||
"Last-Translator: \n"
|
||||
"Language-Team: English\n"
|
||||
@@ -1666,32 +1666,28 @@ msgstr ""
|
||||
msgid "pdfa-3"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/models.py:39
|
||||
msgid "skip"
|
||||
#: paperless/models.py:39 paperless/models.py:50
|
||||
msgid "auto"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/models.py:40
|
||||
msgid "redo"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/models.py:41
|
||||
msgid "force"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/models.py:42
|
||||
msgid "skip_noarchive"
|
||||
#: paperless/models.py:41
|
||||
msgid "redo"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/models.py:50
|
||||
msgid "never"
|
||||
#: paperless/models.py:42
|
||||
msgid "off"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/models.py:51
|
||||
msgid "with_text"
|
||||
msgid "always"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/models.py:52
|
||||
msgid "always"
|
||||
msgid "never"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/models.py:60
|
||||
@@ -1755,7 +1751,7 @@ msgid "Sets the OCR mode"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/models.py:130
|
||||
msgid "Controls the generation of an archive file"
|
||||
msgid "Controls archive file generation"
|
||||
msgstr ""
|
||||
|
||||
#: paperless/models.py:138
|
||||
|
||||
@@ -5,6 +5,7 @@ import shutil
|
||||
import stat
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.checks import Error
|
||||
@@ -22,7 +23,7 @@ writeable_hint = (
|
||||
)
|
||||
|
||||
|
||||
def path_check(var, directory: Path) -> list[Error]:
|
||||
def path_check(var: str, directory: Path) -> list[Error]:
|
||||
messages: list[Error] = []
|
||||
if directory:
|
||||
if not directory.is_dir():
|
||||
@@ -59,7 +60,7 @@ def path_check(var, directory: Path) -> list[Error]:
|
||||
|
||||
|
||||
@register()
|
||||
def paths_check(app_configs, **kwargs) -> list[Error]:
|
||||
def paths_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
"""
|
||||
Check the various paths for existence, readability and writeability
|
||||
"""
|
||||
@@ -73,7 +74,7 @@ def paths_check(app_configs, **kwargs) -> list[Error]:
|
||||
|
||||
|
||||
@register()
|
||||
def binaries_check(app_configs, **kwargs):
|
||||
def binaries_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
"""
|
||||
Paperless requires the existence of a few binaries, so we do some checks
|
||||
for those here.
|
||||
@@ -93,7 +94,7 @@ def binaries_check(app_configs, **kwargs):
|
||||
|
||||
|
||||
@register()
|
||||
def debug_mode_check(app_configs, **kwargs):
|
||||
def debug_mode_check(app_configs: Any, **kwargs: Any) -> list[Warning]:
|
||||
if settings.DEBUG:
|
||||
return [
|
||||
Warning(
|
||||
@@ -109,7 +110,7 @@ def debug_mode_check(app_configs, **kwargs):
|
||||
|
||||
|
||||
@register()
|
||||
def settings_values_check(app_configs, **kwargs):
|
||||
def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warning]:
|
||||
"""
|
||||
Validates at least some of the user provided settings
|
||||
"""
|
||||
@@ -132,23 +133,14 @@ def settings_values_check(app_configs, **kwargs):
|
||||
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
|
||||
)
|
||||
|
||||
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
|
||||
if settings.OCR_MODE not in {"auto", "force", "redo", "off"}:
|
||||
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
|
||||
|
||||
if settings.OCR_MODE == "skip_noarchive":
|
||||
msgs.append(
|
||||
Warning(
|
||||
'OCR output mode "skip_noarchive" is deprecated and will be '
|
||||
"removed in a future version. Please use "
|
||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
|
||||
),
|
||||
)
|
||||
|
||||
if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
|
||||
if settings.ARCHIVE_FILE_GENERATION not in {"auto", "always", "never"}:
|
||||
msgs.append(
|
||||
Error(
|
||||
"OCR_SKIP_ARCHIVE_FILE setting "
|
||||
f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid',
|
||||
"PAPERLESS_ARCHIVE_FILE_GENERATION setting "
|
||||
f'"{settings.ARCHIVE_FILE_GENERATION}" is not valid',
|
||||
),
|
||||
)
|
||||
|
||||
@@ -191,7 +183,7 @@ def settings_values_check(app_configs, **kwargs):
|
||||
|
||||
|
||||
@register()
|
||||
def audit_log_check(app_configs, **kwargs):
|
||||
def audit_log_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
db_conn = connections["default"]
|
||||
all_tables = db_conn.introspection.table_names()
|
||||
result = []
|
||||
@@ -303,7 +295,42 @@ def check_deprecated_db_settings(
|
||||
|
||||
|
||||
@register()
|
||||
def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
|
||||
def check_deprecated_v2_ocr_env_vars(
    app_configs: object,
    **kwargs: object,
) -> list[Warning]:
    """Report v2-era OCR environment variables that no longer do anything.

    Two things are inspected in the process environment:

    * ``PAPERLESS_OCR_SKIP_ARCHIVE_FILE`` set to any non-empty value
      (warning ``paperless.W002``).
    * ``PAPERLESS_OCR_MODE`` set to ``skip`` or ``skip_noarchive``
      (warning ``paperless.W003``).

    Args:
        app_configs: Unused; part of the system-check call signature.
        **kwargs: Unused; part of the system-check call signature.

    Returns:
        The warnings to report, empty when neither variable is affected.
    """
    found: list[Warning] = []

    skip_archive_value = os.environ.get("PAPERLESS_OCR_SKIP_ARCHIVE_FILE")
    if skip_archive_value:
        found.append(
            Warning(
                "PAPERLESS_OCR_SKIP_ARCHIVE_FILE is set but has no effect. "
                "Use PAPERLESS_ARCHIVE_FILE_GENERATION=never/always/auto instead.",
                id="paperless.W002",
            ),
        )

    ocr_mode = os.environ.get("PAPERLESS_OCR_MODE", "")
    uses_removed_mode = ocr_mode == "skip" or ocr_mode == "skip_noarchive"
    if uses_removed_mode:
        found.append(
            Warning(
                f"PAPERLESS_OCR_MODE={ocr_mode!r} is not a valid value. "
                f"Use PAPERLESS_OCR_MODE=auto (and PAPERLESS_ARCHIVE_FILE_GENERATION=never "
                f"if you used skip_noarchive) instead.",
                id="paperless.W003",
            ),
        )

    return found
|
||||
|
||||
|
||||
@register()
|
||||
def check_remote_parser_configured(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
|
||||
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
|
||||
):
|
||||
@@ -329,7 +356,7 @@ def get_tesseract_langs():
|
||||
|
||||
|
||||
@register()
|
||||
def check_default_language_available(app_configs, **kwargs):
|
||||
def check_default_language_available(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
errs = []
|
||||
|
||||
if not settings.OCR_LANGUAGE:
|
||||
|
||||
@@ -4,6 +4,11 @@ import json
|
||||
from django.conf import settings
|
||||
|
||||
from paperless.models import ApplicationConfiguration
|
||||
from paperless.models import ArchiveFileGenerationChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ColorConvertChoices
|
||||
from paperless.models import ModeChoices
|
||||
from paperless.models import OutputTypeChoices
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
@@ -28,7 +33,7 @@ class OutputTypeConfig(BaseConfig):
|
||||
Almost all parsers care about the chosen PDF output format
|
||||
"""
|
||||
|
||||
output_type: str = dataclasses.field(init=False)
|
||||
output_type: OutputTypeChoices = dataclasses.field(init=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
app_config = self._get_config_instance()
|
||||
@@ -45,15 +50,17 @@ class OcrConfig(OutputTypeConfig):
|
||||
|
||||
pages: int | None = dataclasses.field(init=False)
|
||||
language: str = dataclasses.field(init=False)
|
||||
mode: str = dataclasses.field(init=False)
|
||||
skip_archive_file: str = dataclasses.field(init=False)
|
||||
mode: ModeChoices = dataclasses.field(init=False)
|
||||
archive_file_generation: ArchiveFileGenerationChoices = dataclasses.field(
|
||||
init=False,
|
||||
)
|
||||
image_dpi: int | None = dataclasses.field(init=False)
|
||||
clean: str = dataclasses.field(init=False)
|
||||
clean: CleanChoices = dataclasses.field(init=False)
|
||||
deskew: bool = dataclasses.field(init=False)
|
||||
rotate: bool = dataclasses.field(init=False)
|
||||
rotate_threshold: float = dataclasses.field(init=False)
|
||||
max_image_pixel: float | None = dataclasses.field(init=False)
|
||||
color_conversion_strategy: str = dataclasses.field(init=False)
|
||||
color_conversion_strategy: ColorConvertChoices = dataclasses.field(init=False)
|
||||
user_args: dict[str, str] | None = dataclasses.field(init=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
@@ -64,8 +71,8 @@ class OcrConfig(OutputTypeConfig):
|
||||
self.pages = app_config.pages or settings.OCR_PAGES
|
||||
self.language = app_config.language or settings.OCR_LANGUAGE
|
||||
self.mode = app_config.mode or settings.OCR_MODE
|
||||
self.skip_archive_file = (
|
||||
app_config.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
|
||||
self.archive_file_generation = (
|
||||
app_config.archive_file_generation or settings.ARCHIVE_FILE_GENERATION
|
||||
)
|
||||
self.image_dpi = app_config.image_dpi or settings.OCR_IMAGE_DPI
|
||||
self.clean = app_config.unpaper_clean or settings.OCR_CLEAN
|
||||
|
||||
90
src/paperless/migrations/0008_replace_skip_archive_file.py
Normal file
90
src/paperless/migrations/0008_replace_skip_archive_file.py
Normal file
@@ -0,0 +1,90 @@
|
||||
# Generated by Django 5.2.12 on 2026-03-26 20:31
|
||||
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
# Old ModeChoices value -> new ModeChoices value.
_MODE_MAP = {
    "skip": "auto",
    "redo": "redo",
    "force": "force",
    "skip_noarchive": "auto",
}

# Old skip_archive_file value -> new archive_file_generation value.
# The old field said when to SKIP the archive, the new one says when to
# GENERATE it, so the meaning is inverted.
_ARCHIVE_MAP = {
    # never skip -> always generate
    "never": "always",
    # skip when text present -> auto
    "with_text": "auto",
    # always skip -> never generate
    "always": "never",
}


def migrate_old_values(apps, schema_editor):
    """Translate stored v2 OCR configuration values to their new equivalents.

    For every ``ApplicationConfiguration`` row:

    * ``mode`` is rewritten through ``_MODE_MAP`` when it holds a known
      old value.
    * ``skip_archive_file`` is translated through ``_ARCHIVE_MAP`` into
      ``archive_file_generation``.
    * A row that used ``skip_noarchive`` without an archive preference
      gets ``archive_file_generation = "never"``.

    Values that are not in the maps are left untouched.

    Args:
        apps: Historical app registry supplied by the migration runner.
        schema_editor: Unused; part of the ``RunPython`` call signature.
    """
    ApplicationConfiguration = apps.get_model("paperless", "ApplicationConfiguration")
    for config in ApplicationConfiguration.objects.all():
        old_mode = config.mode
        old_skip = config.skip_archive_file

        # Map the old mode value
        if old_mode in _MODE_MAP:
            config.mode = _MODE_MAP[old_mode]

        # Map skip_archive_file -> archive_file_generation
        if old_skip in _ARCHIVE_MAP:
            config.archive_file_generation = _ARCHIVE_MAP[old_skip]

        # skip_noarchive implied no archive file; set that if the user
        # didn't already have an explicit skip_archive_file preference.
        # The column is nullable AND blank=True, so "unset" may be stored
        # as either None or "". The runtime config reads the field with
        # `value or settings.X`, i.e. it treats both as unset, so do the
        # same here instead of checking for None only.
        if old_mode == "skip_noarchive" and not old_skip:
            config.archive_file_generation = "never"

        config.save()
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Replace ``skip_archive_file`` with ``archive_file_generation``.

    The operations run in order: widen the ``mode`` choices, add the new
    column, copy and translate the stored values, then drop the old column.
    """

    dependencies = [("paperless", "0007_optimize_integer_field_sizes")]

    operations = [
        # 1. Update mode choices in-place (old values still in the column)
        migrations.AlterField(
            model_name="applicationconfiguration",
            name="mode",
            field=models.CharField(
                verbose_name="Sets the OCR mode",
                max_length=16,
                null=True,
                blank=True,
                choices=[
                    ("auto", "auto"),
                    ("force", "force"),
                    ("redo", "redo"),
                    ("off", "off"),
                ],
            ),
        ),
        # 2. Add the new field
        migrations.AddField(
            model_name="applicationconfiguration",
            name="archive_file_generation",
            field=models.CharField(
                verbose_name="Controls archive file generation",
                max_length=8,
                null=True,
                blank=True,
                choices=[
                    ("auto", "auto"),
                    ("always", "always"),
                    ("never", "never"),
                ],
            ),
        ),
        # 3. Migrate data from old values to new; reversing is a no-op
        migrations.RunPython(
            code=migrate_old_values,
            reverse_code=migrations.RunPython.noop,
        ),
        # 4. Drop the old field
        migrations.RemoveField(
            model_name="applicationconfiguration",
            name="skip_archive_file",
        ),
    ]
|
||||
@@ -36,20 +36,20 @@ class ModeChoices(models.TextChoices):
|
||||
and our own custom setting
|
||||
"""
|
||||
|
||||
SKIP = ("skip", _("skip"))
|
||||
REDO = ("redo", _("redo"))
|
||||
AUTO = ("auto", _("auto"))
|
||||
FORCE = ("force", _("force"))
|
||||
SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive"))
|
||||
REDO = ("redo", _("redo"))
|
||||
OFF = ("off", _("off"))
|
||||
|
||||
|
||||
class ArchiveFileChoices(models.TextChoices):
|
||||
class ArchiveFileGenerationChoices(models.TextChoices):
|
||||
"""
|
||||
Settings to control creation of an archive PDF file
|
||||
"""
|
||||
|
||||
NEVER = ("never", _("never"))
|
||||
WITH_TEXT = ("with_text", _("with_text"))
|
||||
AUTO = ("auto", _("auto"))
|
||||
ALWAYS = ("always", _("always"))
|
||||
NEVER = ("never", _("never"))
|
||||
|
||||
|
||||
class CleanChoices(models.TextChoices):
|
||||
@@ -126,12 +126,12 @@ class ApplicationConfiguration(AbstractSingletonModel):
|
||||
choices=ModeChoices.choices,
|
||||
)
|
||||
|
||||
skip_archive_file = models.CharField(
|
||||
verbose_name=_("Controls the generation of an archive file"),
|
||||
archive_file_generation = models.CharField(
|
||||
verbose_name=_("Controls archive file generation"),
|
||||
null=True,
|
||||
blank=True,
|
||||
max_length=16,
|
||||
choices=ArchiveFileChoices.choices,
|
||||
max_length=8,
|
||||
choices=ArchiveFileGenerationChoices.choices,
|
||||
)
|
||||
|
||||
image_dpi = models.PositiveSmallIntegerField(
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.resources
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
@@ -8,6 +9,8 @@ import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Any
|
||||
from typing import Final
|
||||
from typing import NoReturn
|
||||
from typing import Self
|
||||
|
||||
from django.conf import settings
|
||||
@@ -15,12 +18,16 @@ from PIL import Image
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import maybe_override_pixel_limit
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ModeChoices
|
||||
from paperless.models import OutputTypeChoices
|
||||
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
|
||||
from paperless.parsers.utils import extract_pdf_text
|
||||
from paperless.parsers.utils import is_tagged_pdf
|
||||
from paperless.parsers.utils import read_file_handle_unicode_errors
|
||||
from paperless.version import __full_version_str__
|
||||
|
||||
@@ -33,7 +40,11 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.tesseract")
|
||||
|
||||
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||
_SRGB_ICC_DATA: Final[bytes] = (
|
||||
importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
|
||||
)
|
||||
|
||||
_SUPPORTED_MIME_TYPES: Final[dict[str, str]] = {
|
||||
"application/pdf": ".pdf",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/png": ".png",
|
||||
@@ -99,7 +110,7 @@ class RasterisedDocumentParser:
|
||||
# Lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def __init__(self, logging_group: object = None) -> None:
|
||||
def __init__(self, logging_group: object | None = None) -> None:
|
||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||
self.tempdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||
@@ -233,7 +244,7 @@ class RasterisedDocumentParser:
|
||||
if (
|
||||
sidecar_file is not None
|
||||
and sidecar_file.is_file()
|
||||
and self.settings.mode != "redo"
|
||||
and self.settings.mode != ModeChoices.REDO
|
||||
):
|
||||
text = read_file_handle_unicode_errors(sidecar_file)
|
||||
|
||||
@@ -250,36 +261,7 @@ class RasterisedDocumentParser:
|
||||
if not Path(pdf_file).is_file():
|
||||
return None
|
||||
|
||||
try:
|
||||
text = None
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w+",
|
||||
dir=self.tempdir,
|
||||
) as tmp:
|
||||
run_subprocess(
|
||||
[
|
||||
"pdftotext",
|
||||
"-q",
|
||||
"-layout",
|
||||
"-enc",
|
||||
"UTF-8",
|
||||
str(pdf_file),
|
||||
tmp.name,
|
||||
],
|
||||
logger=self.log,
|
||||
)
|
||||
text = read_file_handle_unicode_errors(Path(tmp.name))
|
||||
|
||||
return post_process_text(text)
|
||||
|
||||
except Exception:
|
||||
# If pdftotext fails, fall back to OCR.
|
||||
self.log.warning(
|
||||
"Error while getting text from PDF document with pdftotext",
|
||||
exc_info=True,
|
||||
)
|
||||
# probably not a PDF file.
|
||||
return None
|
||||
return post_process_text(extract_pdf_text(Path(pdf_file), log=self.log))
|
||||
|
||||
def construct_ocrmypdf_parameters(
|
||||
self,
|
||||
@@ -289,6 +271,7 @@ class RasterisedDocumentParser:
|
||||
sidecar_file: Path,
|
||||
*,
|
||||
safe_fallback: bool = False,
|
||||
skip_text: bool = False,
|
||||
) -> dict[str, Any]:
|
||||
ocrmypdf_args: dict[str, Any] = {
|
||||
"input_file_or_options": input_file,
|
||||
@@ -307,15 +290,14 @@ class RasterisedDocumentParser:
|
||||
self.settings.color_conversion_strategy
|
||||
)
|
||||
|
||||
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
|
||||
if safe_fallback or self.settings.mode == ModeChoices.FORCE:
|
||||
ocrmypdf_args["force_ocr"] = True
|
||||
elif self.settings.mode in {
|
||||
ModeChoices.SKIP,
|
||||
ModeChoices.SKIP_NO_ARCHIVE,
|
||||
}:
|
||||
ocrmypdf_args["skip_text"] = True
|
||||
elif self.settings.mode == ModeChoices.REDO:
|
||||
ocrmypdf_args["redo_ocr"] = True
|
||||
elif skip_text or self.settings.mode == ModeChoices.OFF:
|
||||
ocrmypdf_args["skip_text"] = True
|
||||
elif self.settings.mode == ModeChoices.AUTO:
|
||||
pass # no extra flag: normal OCR (text not found case)
|
||||
else: # pragma: no cover
|
||||
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
|
||||
|
||||
@@ -400,6 +382,115 @@ class RasterisedDocumentParser:
|
||||
|
||||
return ocrmypdf_args
|
||||
|
||||
def _convert_image_to_pdfa(self, document_path: Path) -> Path:
    """Convert an image to a PDF/A-2b file without invoking the OCR engine.

    Uses img2pdf for the initial image->PDF wrapping, then pikepdf to add
    an sRGB output intent and stamp PDF/A-2b conformance metadata.

    No Tesseract and no Ghostscript are invoked.

    Args:
        document_path: Path of the image to wrap.

    Returns:
        Path of ``archive.pdf`` inside the parser's temp directory. If the
        PDF/A stamping step fails, a warning is logged and this file is a
        byte copy of the plain img2pdf output instead.

    Raises:
        ParseError: If the img2pdf conversion itself fails.
    """
    import img2pdf
    import pikepdf

    plain_pdf_path = Path(self.tempdir) / "image_plain.pdf"
    try:
        convert_kwargs: dict = {}
        if self.settings.image_dpi is not None:
            # Use the configured DPI for both axes instead of the value
            # img2pdf would otherwise pick on its own.
            convert_kwargs["layout_fun"] = img2pdf.get_fixed_dpi_layout_fun(
                (self.settings.image_dpi, self.settings.image_dpi),
            )
        plain_pdf_path.write_bytes(
            img2pdf.convert(str(document_path), **convert_kwargs),
        )
    except Exception as e:
        raise ParseError(
            f"img2pdf conversion failed for {document_path}: {e!s}",
        ) from e

    pdfa_path = Path(self.tempdir) / "archive.pdf"
    try:
        with pikepdf.open(plain_pdf_path) as pdf:
            # Embed the sRGB ICC profile as a 3-component colour stream.
            cs = pdf.make_stream(_SRGB_ICC_DATA)
            cs["/N"] = 3
            output_intent = pikepdf.Dictionary(
                Type=pikepdf.Name("/OutputIntent"),
                S=pikepdf.Name("/GTS_PDFA1"),
                OutputConditionIdentifier=pikepdf.String("sRGB"),
                DestOutputProfile=cs,
            )
            pdf.Root["/OutputIntents"] = pdf.make_indirect(
                pikepdf.Array([output_intent]),
            )
            # pikepdf only accepts and commits XMP edits made inside the
            # metadata context manager; assigning outside a `with` block
            # raises, which previously sent every call down the
            # plain-PDF fallback below.
            with pdf.open_metadata(set_pikepdf_as_editor=False) as meta:
                meta["pdfaid:part"] = "2"
                meta["pdfaid:conformance"] = "B"
            pdf.save(pdfa_path)
    except Exception as e:
        # Best effort: an unstamped PDF is still a usable archive file.
        self.log.warning(
            f"PDF/A metadata stamping failed ({e!s}); falling back to plain PDF.",
        )
        pdfa_path.write_bytes(plain_pdf_path.read_bytes())

    return pdfa_path
|
||||
|
||||
def _convert_pdf_to_pdfa(
    self,
    input_path: Path,
    output_path: Path,
) -> None:
    """Produce the archive PDF for ``input_path`` without running OCR.

    When the configured output type is plain PDF the input is simply
    copied. Otherwise a PDF/A pdfmark file is generated in the temp
    directory and Ghostscript is run through ocrmypdf's ``generate_pdfa``
    using the configured output type and colour conversion strategy
    (``"RGB"`` when none is configured).

    NOTE(review): nothing in this method reads
    ``continue_on_soft_render_error``; confirm whether it should.

    Args:
        input_path: The source PDF.
        output_path: Where the converted (or copied) PDF is written.
    """
    from ocrmypdf._exec.ghostscript import generate_pdfa
    from ocrmypdf.pdfa import generate_pdfa_ps

    requested_type = self.settings.output_type
    if requested_type == OutputTypeChoices.PDF:
        # No PDF/A requested — just copy the original
        copy_file_with_basic_stats(input_path, output_path)
        return

    # Map output_type to pdfa_part: pdfa→2, pdfa-1→1, pdfa-2→2, pdfa-3→3
    if requested_type == "pdfa":
        pdfa_part = "2"
    else:
        pdfa_part = requested_type.split("-")[-1]

    pdfmark_file = Path(self.tempdir) / "pdfa.ps"
    generate_pdfa_ps(pdfmark_file)

    strategy = self.settings.color_conversion_strategy or "RGB"

    self.log.debug(
        "Converting PDF to PDF/A-%s via Ghostscript (no OCR): %s",
        pdfa_part,
        input_path,
    )

    generate_pdfa(
        pdf_pages=[pdfmark_file, input_path],
        output_file=output_path,
        compression="auto",
        color_conversion_strategy=strategy,
        pdfa_part=pdfa_part,
    )
|
||||
|
||||
def _handle_subprocess_output_error(self, e: Exception) -> NoReturn:
    """Turn a subprocess output failure into a ``ParseError``.

    If the error text mentions Ghostscript PDF/A rendering, a warning
    suggesting the ``continue_on_soft_render_error`` user argument is
    logged first. This method never returns normally.

    Args:
        e: The exception to wrap; it is chained as the cause.

    Raises:
        ParseError: Always.
    """
    details = str(e)
    ghostscript_failed = "Ghostscript PDF/A rendering" in details
    if ghostscript_failed:
        self.log.warning(
            "Ghostscript PDF/A rendering failed, consider setting "
            "PAPERLESS_OCR_USER_ARGS: "
            "'{\"continue_on_soft_render_error\": true}'",
        )
    raise ParseError(
        f"SubprocessOutputError: {details}. See logs for more information.",
    ) from e
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
@@ -409,57 +500,107 @@ class RasterisedDocumentParser:
|
||||
) -> None:
|
||||
# This forces tesseract to use one core per page.
|
||||
os.environ["OMP_THREAD_LIMIT"] = "1"
|
||||
VALID_TEXT_LENGTH = 50
|
||||
|
||||
if mime_type == "application/pdf":
|
||||
text_original = self.extract_text(None, document_path)
|
||||
original_has_text = (
|
||||
text_original is not None and len(text_original) > VALID_TEXT_LENGTH
|
||||
)
|
||||
else:
|
||||
text_original = None
|
||||
original_has_text = False
|
||||
|
||||
# If the original has text, and the user doesn't want an archive,
|
||||
# we're done here
|
||||
skip_archive_for_text = (
|
||||
self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
|
||||
or self.settings.skip_archive_file
|
||||
in {
|
||||
ArchiveFileChoices.WITH_TEXT,
|
||||
ArchiveFileChoices.ALWAYS,
|
||||
}
|
||||
)
|
||||
if skip_archive_for_text and original_has_text:
|
||||
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
|
||||
self.text = text_original
|
||||
return
|
||||
|
||||
# Either no text was in the original or there should be an archive
|
||||
# file created, so OCR the file and create an archive with any
|
||||
# text located via OCR
|
||||
|
||||
import ocrmypdf
|
||||
from ocrmypdf import EncryptedPdfError
|
||||
from ocrmypdf import InputFileError
|
||||
from ocrmypdf import SubprocessOutputError
|
||||
from ocrmypdf.exceptions import DigitalSignatureError
|
||||
from ocrmypdf.exceptions import PriorOcrFoundError
|
||||
|
||||
if mime_type == "application/pdf":
|
||||
text_original = self.extract_text(None, document_path)
|
||||
original_has_text = is_tagged_pdf(document_path, log=self.log) or (
|
||||
text_original is not None and len(text_original) > PDF_TEXT_MIN_LENGTH
|
||||
)
|
||||
else:
|
||||
text_original = None
|
||||
original_has_text = False
|
||||
|
||||
self.log.debug(
|
||||
"Text detection: original_has_text=%s (text_length=%d, mode=%s, produce_archive=%s)",
|
||||
original_has_text,
|
||||
len(text_original) if text_original else 0,
|
||||
self.settings.mode,
|
||||
produce_archive,
|
||||
)
|
||||
|
||||
# --- OCR_MODE=off: never invoke OCR engine ---
|
||||
if self.settings.mode == ModeChoices.OFF:
|
||||
if not produce_archive:
|
||||
self.log.debug(
|
||||
"OCR: skipped — OCR_MODE=off, no archive requested;"
|
||||
" returning pdftotext content only",
|
||||
)
|
||||
self.text = text_original or ""
|
||||
return
|
||||
if self.is_image(mime_type):
|
||||
self.log.debug(
|
||||
"OCR: skipped — OCR_MODE=off, image input;"
|
||||
" converting to PDF/A without OCR",
|
||||
)
|
||||
try:
|
||||
self.archive_path = self._convert_image_to_pdfa(
|
||||
document_path,
|
||||
)
|
||||
self.text = ""
|
||||
except Exception as e:
|
||||
raise ParseError(
|
||||
f"Image to PDF/A conversion failed: {e!s}",
|
||||
) from e
|
||||
return
|
||||
# PDFs in off mode: PDF/A conversion via Ghostscript, no OCR
|
||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
||||
try:
|
||||
self._convert_pdf_to_pdfa(document_path, archive_path)
|
||||
self.archive_path = archive_path
|
||||
self.text = text_original or ""
|
||||
except SubprocessOutputError as e:
|
||||
self._handle_subprocess_output_error(e)
|
||||
except Exception as e:
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
return
|
||||
|
||||
# --- OCR_MODE=auto: skip ocrmypdf entirely if text exists and no archive needed ---
|
||||
if (
|
||||
self.settings.mode == ModeChoices.AUTO
|
||||
and original_has_text
|
||||
and not produce_archive
|
||||
):
|
||||
self.log.debug(
|
||||
"Document has text and no archive requested; skipping OCRmyPDF entirely.",
|
||||
)
|
||||
self.text = text_original
|
||||
return
|
||||
|
||||
# --- All other paths: run ocrmypdf ---
|
||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
||||
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
||||
|
||||
# auto mode with existing text: PDF/A conversion only (no OCR).
|
||||
skip_text = self.settings.mode == ModeChoices.AUTO and original_has_text
|
||||
|
||||
if skip_text:
|
||||
self.log.debug(
|
||||
"OCR strategy: PDF/A conversion only (skip_text)"
|
||||
" — OCR_MODE=auto, document already has text",
|
||||
)
|
||||
else:
|
||||
self.log.debug("OCR strategy: full OCR — OCR_MODE=%s", self.settings.mode)
|
||||
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
archive_path,
|
||||
sidecar_file,
|
||||
skip_text=skip_text,
|
||||
)
|
||||
|
||||
try:
|
||||
self.log.debug(f"Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
|
||||
if produce_archive:
|
||||
self.archive_path = archive_path
|
||||
|
||||
self.text = self.extract_text(sidecar_file, archive_path)
|
||||
@@ -474,16 +615,8 @@ class RasterisedDocumentParser:
|
||||
if original_has_text:
|
||||
self.text = text_original
|
||||
except SubprocessOutputError as e:
|
||||
if "Ghostscript PDF/A rendering" in str(e):
|
||||
self.log.warning(
|
||||
"Ghostscript PDF/A rendering failed, consider setting "
|
||||
"PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
|
||||
)
|
||||
|
||||
raise ParseError(
|
||||
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
||||
) from e
|
||||
except (NoTextFoundException, InputFileError) as e:
|
||||
self._handle_subprocess_output_error(e)
|
||||
except (NoTextFoundException, InputFileError, PriorOcrFoundError) as e:
|
||||
self.log.warning(
|
||||
f"Encountered an error while running OCR: {e!s}. "
|
||||
f"Attempting force OCR to get the text.",
|
||||
@@ -492,8 +625,6 @@ class RasterisedDocumentParser:
|
||||
archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
|
||||
sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
|
||||
|
||||
# Attempt to run OCR with safe settings.
|
||||
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
@@ -505,25 +636,18 @@ class RasterisedDocumentParser:
|
||||
try:
|
||||
self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
# Don't return the archived file here, since this file
|
||||
# is bigger and blurry due to --force-ocr.
|
||||
|
||||
self.text = self.extract_text(
|
||||
sidecar_file_fallback,
|
||||
archive_path_fallback,
|
||||
)
|
||||
|
||||
if produce_archive:
|
||||
self.archive_path = archive_path_fallback
|
||||
except Exception as e:
|
||||
# If this fails, we have a serious issue at hand.
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
|
||||
except Exception as e:
|
||||
# Anything else is probably serious.
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
|
||||
# As a last resort, if we still don't have any text for any reason,
|
||||
# try to extract the text from the original document.
|
||||
if not self.text:
|
||||
if original_has_text:
|
||||
self.text = text_original
|
||||
|
||||
@@ -10,15 +10,105 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Final
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
|
||||
logger = logging.getLogger("paperless.parsers.utils")
|
||||
|
||||
# Minimum character count for a PDF to be considered "born-digital" (has real text).
|
||||
# Used by both the consumer (archive decision) and the tesseract parser (skip-OCR decision).
|
||||
PDF_TEXT_MIN_LENGTH: Final[int] = 50
|
||||
|
||||
|
||||
def is_tagged_pdf(
|
||||
path: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> bool:
|
||||
"""Return True if the PDF declares itself as tagged (born-digital indicator).
|
||||
|
||||
Tagged PDFs (e.g. exported from Word or LibreOffice) have ``/MarkInfo``
|
||||
with ``/Marked true`` in the document root. This is a reliable signal
|
||||
that the document has a logical structure and embedded text — running OCR
|
||||
on it is unnecessary and archive generation can be skipped.
|
||||
|
||||
https://github.com/ocrmypdf/OCRmyPDF/blob/4e974ebd465a5921b2e79004f098f5d203010282/src/ocrmypdf/pdfinfo/info.py#L449
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path:
|
||||
Absolute path to the PDF file.
|
||||
log:
|
||||
Logger for warnings. Falls back to the module-level logger when omitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
``True`` when the PDF is tagged, ``False`` otherwise or on any error.
|
||||
"""
|
||||
import pikepdf
|
||||
|
||||
_log = log or logger
|
||||
try:
|
||||
with pikepdf.open(path) as pdf:
|
||||
mark_info = pdf.Root.get("/MarkInfo")
|
||||
if mark_info is None:
|
||||
return False
|
||||
return bool(mark_info.get("/Marked", False))
|
||||
except Exception:
|
||||
_log.warning("Could not check PDF tag status for %s", path, exc_info=True)
|
||||
return False
|
||||
|
||||
|
||||
def extract_pdf_text(
|
||||
path: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> str | None:
|
||||
"""Run pdftotext on *path* and return the extracted text, or None on failure.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path:
|
||||
Absolute path to the PDF file.
|
||||
log:
|
||||
Logger for warnings. Falls back to the module-level logger when omitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or ``None`` if pdftotext fails, produces no text, or the file is not a PDF.
|
||||
"""
|
||||
from documents.utils import run_subprocess
|
||||
|
||||
_log = log or logger
|
||||
try:
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
out_path = Path(tmpdir) / "text.txt"
|
||||
run_subprocess(
|
||||
[
|
||||
"pdftotext",
|
||||
"-q",
|
||||
"-layout",
|
||||
"-enc",
|
||||
"UTF-8",
|
||||
str(path),
|
||||
str(out_path),
|
||||
],
|
||||
logger=_log,
|
||||
)
|
||||
text = read_file_handle_unicode_errors(out_path, log=_log)
|
||||
return text or None
|
||||
except Exception:
|
||||
_log.warning(
|
||||
"Error while getting text from PDF document with pdftotext",
|
||||
exc_info=True,
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
def read_file_handle_unicode_errors(
|
||||
filepath: Path,
|
||||
|
||||
@@ -889,10 +889,23 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
||||
# OCRmyPDF --output-type options are available.
|
||||
OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
|
||||
|
||||
# skip. redo, force
|
||||
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
|
||||
if os.environ.get("PAPERLESS_OCR_MODE", "") in (
|
||||
"skip",
|
||||
"skip_noarchive",
|
||||
): # pragma: no cover
|
||||
OCR_MODE = "auto"
|
||||
else:
|
||||
OCR_MODE = get_choice_from_env(
|
||||
"PAPERLESS_OCR_MODE",
|
||||
{"auto", "force", "redo", "off"},
|
||||
default="auto",
|
||||
)
|
||||
|
||||
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
|
||||
ARCHIVE_FILE_GENERATION = get_choice_from_env(
|
||||
"PAPERLESS_ARCHIVE_FILE_GENERATION",
|
||||
{"auto", "always", "never"},
|
||||
default="auto",
|
||||
)
|
||||
|
||||
OCR_IMAGE_DPI = get_int_from_env("PAPERLESS_OCR_IMAGE_DPI")
|
||||
|
||||
|
||||
@@ -708,7 +708,7 @@ def null_app_config(mocker: MockerFixture) -> MagicMock:
|
||||
pages=None,
|
||||
language=None,
|
||||
mode=None,
|
||||
skip_archive_file=None,
|
||||
archive_file_generation=None,
|
||||
image_dpi=None,
|
||||
unpaper_clean=None,
|
||||
deskew=None,
|
||||
|
||||
141
src/paperless/tests/parsers/test_convert_image_to_pdfa.py
Normal file
141
src/paperless/tests/parsers/test_convert_image_to_pdfa.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""
|
||||
Tests for RasterisedDocumentParser._convert_image_to_pdfa.
|
||||
|
||||
The method converts an image to a PDF/A-2b file using img2pdf (wrapping)
|
||||
then pikepdf (PDF/A metadata stamping), with a fallback to plain PDF when
|
||||
pikepdf stamping fails. No Tesseract or Ghostscript is invoked.
|
||||
|
||||
These are unit/integration tests: img2pdf and pikepdf run for real; only
|
||||
error-path branches mock the respective library call.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import img2pdf
|
||||
import magic
|
||||
import pikepdf
|
||||
import pytest
|
||||
|
||||
from documents.parsers import ParseError
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
|
||||
|
||||
class TestConvertImageToPdfa:
|
||||
"""_convert_image_to_pdfa: output shape, error paths, DPI handling."""
|
||||
|
||||
def test_valid_png_produces_pdf_bytes(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_png_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN: a valid PNG with DPI metadata
|
||||
WHEN: _convert_image_to_pdfa is called
|
||||
THEN: the returned file is non-empty and begins with the PDF magic bytes
|
||||
"""
|
||||
result = tesseract_parser._convert_image_to_pdfa(simple_png_file)
|
||||
|
||||
assert result.exists()
|
||||
assert magic.from_file(str(result), mime=True) == "application/pdf"
|
||||
|
||||
def test_output_path_is_archive_pdf_in_tempdir(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_png_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN: any valid image
|
||||
WHEN: _convert_image_to_pdfa is called
|
||||
THEN: the returned path is exactly <tempdir>/archive.pdf
|
||||
"""
|
||||
result = tesseract_parser._convert_image_to_pdfa(simple_png_file)
|
||||
|
||||
assert result == Path(tesseract_parser.tempdir) / "archive.pdf"
|
||||
|
||||
def test_img2pdf_failure_raises_parse_error(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_png_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN: img2pdf.convert raises an exception
|
||||
WHEN: _convert_image_to_pdfa is called
|
||||
THEN: a ParseError is raised that mentions "img2pdf conversion failed"
|
||||
"""
|
||||
mocker.patch.object(img2pdf, "convert", side_effect=Exception("boom"))
|
||||
|
||||
with pytest.raises(ParseError, match="img2pdf conversion failed"):
|
||||
tesseract_parser._convert_image_to_pdfa(simple_png_file)
|
||||
|
||||
def test_pikepdf_stamping_failure_falls_back_to_plain_pdf(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_png_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN: pikepdf.open raises during PDF/A metadata stamping
|
||||
WHEN: _convert_image_to_pdfa is called
|
||||
THEN: no exception is raised and the returned file is still a valid PDF
|
||||
(plain PDF bytes are used as fallback)
|
||||
"""
|
||||
mocker.patch.object(pikepdf, "open", side_effect=Exception("pikepdf boom"))
|
||||
|
||||
result = tesseract_parser._convert_image_to_pdfa(simple_png_file)
|
||||
|
||||
assert result.exists()
|
||||
assert magic.from_file(str(result), mime=True) == "application/pdf"
|
||||
|
||||
def test_image_dpi_setting_applies_fixed_dpi_layout(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_no_dpi_png_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN: parser.settings.image_dpi = 150
|
||||
WHEN: _convert_image_to_pdfa is called with a no-DPI PNG
|
||||
THEN: img2pdf.get_fixed_dpi_layout_fun is called with (150, 150)
|
||||
and the output is still a valid PDF
|
||||
"""
|
||||
spy = mocker.patch.object(
|
||||
img2pdf,
|
||||
"get_fixed_dpi_layout_fun",
|
||||
wraps=img2pdf.get_fixed_dpi_layout_fun,
|
||||
)
|
||||
tesseract_parser.settings.image_dpi = 150
|
||||
|
||||
result = tesseract_parser._convert_image_to_pdfa(simple_no_dpi_png_file)
|
||||
|
||||
spy.assert_called_once_with((150, 150))
|
||||
assert magic.from_file(str(result), mime=True) == "application/pdf"
|
||||
|
||||
def test_no_image_dpi_setting_skips_fixed_dpi_layout(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_png_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN: parser.settings.image_dpi is None (default)
|
||||
WHEN: _convert_image_to_pdfa is called
|
||||
THEN: img2pdf.get_fixed_dpi_layout_fun is never called
|
||||
"""
|
||||
spy = mocker.patch.object(
|
||||
img2pdf,
|
||||
"get_fixed_dpi_layout_fun",
|
||||
wraps=img2pdf.get_fixed_dpi_layout_fun,
|
||||
)
|
||||
tesseract_parser.settings.image_dpi = None
|
||||
|
||||
tesseract_parser._convert_image_to_pdfa(simple_png_file)
|
||||
|
||||
spy.assert_not_called()
|
||||
440
src/paperless/tests/parsers/test_parse_modes.py
Normal file
440
src/paperless/tests/parsers/test_parse_modes.py
Normal file
@@ -0,0 +1,440 @@
|
||||
"""
|
||||
Focused tests for RasterisedDocumentParser.parse() mode behaviour.
|
||||
|
||||
These tests mock ``ocrmypdf.ocr`` so they run without a real Tesseract/OCRmyPDF
|
||||
installation and execute quickly. The intent is to verify the *control flow*
|
||||
introduced by the ``produce_archive`` flag and the ``OCR_MODE=auto/off`` logic,
|
||||
not to test OCRmyPDF itself.
|
||||
|
||||
Fixtures are pulled from conftest.py in this package.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_LONG_TEXT = "This is a test document with enough text. " * 5 # >50 chars
|
||||
_SHORT_TEXT = "Hi." # <50 chars
|
||||
|
||||
|
||||
def _make_extract_text(text: str | None):
|
||||
"""Return a side_effect function for ``extract_text`` that returns *text*."""
|
||||
|
||||
def _extract(sidecar_file, pdf_file):
|
||||
return text
|
||||
|
||||
return _extract
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# AUTO mode — PDF with sufficient text layer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestAutoModeWithText:
|
||||
"""AUTO mode, original PDF has detectable text (>50 chars)."""
|
||||
|
||||
def test_auto_text_no_archive_skips_ocrmypdf(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=False
|
||||
- PDF with text > PDF_TEXT_MIN_LENGTH
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr is NOT called (early return path)
|
||||
- archive_path remains None
|
||||
- text is set from the original
|
||||
"""
|
||||
# Patch extract_text to return long text (simulating detectable text layer)
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text() == _LONG_TEXT
|
||||
|
||||
def test_auto_text_with_archive_calls_ocrmypdf_skip_text(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=True
|
||||
- PDF with text > PDF_TEXT_MIN_LENGTH
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr IS called with skip_text=True
|
||||
- archive_path is set
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=True,
|
||||
)
|
||||
|
||||
mock_ocr.assert_called_once()
|
||||
call_kwargs = mock_ocr.call_args.kwargs
|
||||
assert call_kwargs.get("skip_text") is True
|
||||
assert "force_ocr" not in call_kwargs
|
||||
assert "redo_ocr" not in call_kwargs
|
||||
assert tesseract_parser.archive_path is not None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# AUTO mode — PDF without text layer (or too short)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestAutoModeNoText:
|
||||
"""AUTO mode, original PDF has no detectable text (<= 50 chars)."""
|
||||
|
||||
def test_auto_no_text_with_archive_calls_ocrmypdf_no_extra_flag(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
multi_page_images_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=True
|
||||
- PDF with no text (or text <= PDF_TEXT_MIN_LENGTH)
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr IS called WITHOUT skip_text/force_ocr/redo_ocr
|
||||
- archive_path is set (since produce_archive=True)
|
||||
"""
|
||||
# Return "no text" for the original; return real text for archive
|
||||
extract_call_count = 0
|
||||
|
||||
def _extract_side(sidecar_file, pdf_file):
|
||||
nonlocal extract_call_count
|
||||
extract_call_count += 1
|
||||
if extract_call_count == 1:
|
||||
return None # original has no text
|
||||
return _LONG_TEXT # text from archive after OCR
|
||||
|
||||
mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
multi_page_images_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=True,
|
||||
)
|
||||
|
||||
mock_ocr.assert_called_once()
|
||||
call_kwargs = mock_ocr.call_args.kwargs
|
||||
assert "skip_text" not in call_kwargs
|
||||
assert "force_ocr" not in call_kwargs
|
||||
assert "redo_ocr" not in call_kwargs
|
||||
assert tesseract_parser.archive_path is not None
|
||||
|
||||
def test_auto_no_text_no_archive_calls_ocrmypdf(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
multi_page_images_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=False
|
||||
- PDF with no text
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr IS called (no early return since no text detected)
|
||||
- archive_path is NOT set (produce_archive=False)
|
||||
"""
|
||||
extract_call_count = 0
|
||||
|
||||
def _extract_side(sidecar_file, pdf_file):
|
||||
nonlocal extract_call_count
|
||||
extract_call_count += 1
|
||||
if extract_call_count == 1:
|
||||
return None
|
||||
return _LONG_TEXT
|
||||
|
||||
mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
multi_page_images_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
mock_ocr.assert_called_once()
|
||||
assert tesseract_parser.archive_path is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OFF mode — PDF
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestOffModePdf:
|
||||
"""OCR_MODE=off, document is a PDF."""
|
||||
|
||||
def test_off_no_archive_returns_pdftotext(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- OFF mode, produce_archive=False
|
||||
- PDF with text
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr is NOT called
|
||||
- archive_path is None
|
||||
- text comes from pdftotext (extract_text)
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text() == _LONG_TEXT
|
||||
|
||||
def test_off_with_archive_uses_ghostscript_not_ocr(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- OFF mode, produce_archive=True
|
||||
- PDF document
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr is NOT called
|
||||
- Ghostscript generate_pdfa IS called (PDF/A conversion without OCR)
|
||||
- archive_path is set
|
||||
- text comes from pdftotext, not OCR
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
mock_gs = mocker.patch(
|
||||
"ocrmypdf._exec.ghostscript.generate_pdfa",
|
||||
)
|
||||
mocker.patch("ocrmypdf.pdfa.generate_pdfa_ps")
|
||||
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=True,
|
||||
)
|
||||
|
||||
mock_ocr.assert_not_called()
|
||||
mock_gs.assert_called_once()
|
||||
assert tesseract_parser.archive_path is not None
|
||||
assert tesseract_parser.get_text() == _LONG_TEXT
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OFF mode — image
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestOffModeImage:
|
||||
"""OCR_MODE=off, document is an image (PNG)."""
|
||||
|
||||
def test_off_image_no_archive_no_ocrmypdf(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_png_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- OFF mode, produce_archive=False
|
||||
- Image document (PNG)
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf.ocr is NOT called
|
||||
- archive_path is None
|
||||
- text is empty string (images have no text layer)
|
||||
"""
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(simple_png_file, "image/png", produce_archive=False)
|
||||
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text() == ""
|
||||
|
||||
def test_off_image_with_archive_uses_img2pdf_path(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_png_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- OFF mode, produce_archive=True
|
||||
- Image document (PNG)
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- _convert_image_to_pdfa() is called instead of ocrmypdf.ocr
|
||||
- archive_path is set to the returned path
|
||||
- text is empty string
|
||||
"""
|
||||
fake_archive = Path("/tmp/fake-archive.pdf")
|
||||
mock_convert = mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"_convert_image_to_pdfa",
|
||||
return_value=fake_archive,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(simple_png_file, "image/png", produce_archive=True)
|
||||
|
||||
mock_convert.assert_called_once_with(simple_png_file)
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path == fake_archive
|
||||
assert tesseract_parser.get_text() == ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# produce_archive=False never sets archive_path for FORCE / REDO / AUTO modes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestProduceArchiveFalse:
|
||||
"""Verify produce_archive=False never results in an archive regardless of mode."""
|
||||
|
||||
@pytest.mark.parametrize("mode", ["force", "redo"])
|
||||
def test_produce_archive_false_force_redo_modes(
|
||||
self,
|
||||
mode: str,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
multi_page_images_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- FORCE or REDO mode, produce_archive=False
|
||||
- Any PDF
|
||||
WHEN:
|
||||
- parse() is called (ocrmypdf mocked to succeed)
|
||||
THEN:
|
||||
- archive_path is NOT set even though ocrmypdf ran
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = mode
|
||||
tesseract_parser.parse(
|
||||
multi_page_images_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text() is not None
|
||||
|
||||
def test_produce_archive_false_auto_with_text(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- AUTO mode, produce_archive=False
|
||||
- PDF with text > PDF_TEXT_MIN_LENGTH
|
||||
WHEN:
|
||||
- parse() is called
|
||||
THEN:
|
||||
- ocrmypdf is skipped entirely (early return)
|
||||
- archive_path is None
|
||||
"""
|
||||
mocker.patch.object(
|
||||
tesseract_parser,
|
||||
"extract_text",
|
||||
return_value=_LONG_TEXT,
|
||||
)
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
simple_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
@@ -94,15 +94,35 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
WHEN:
|
||||
- OCR parameters are constructed
|
||||
THEN:
|
||||
- Configuration from database is utilized
|
||||
- Configuration from database is utilized (AUTO mode with skip_text=True
|
||||
triggers skip_text; AUTO mode alone does not add any extra flag)
|
||||
"""
|
||||
# AUTO mode with skip_text=True explicitly passed: skip_text is set
|
||||
with override_settings(OCR_MODE="redo"):
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.mode = ModeChoices.SKIP
|
||||
instance.mode = ModeChoices.AUTO
|
||||
instance.save()
|
||||
|
||||
params = RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
|
||||
input_file="input.pdf",
|
||||
output_file="output.pdf",
|
||||
sidecar_file="sidecar.txt",
|
||||
mime_type="application/pdf",
|
||||
safe_fallback=False,
|
||||
skip_text=True,
|
||||
)
|
||||
self.assertTrue(params["skip_text"])
|
||||
self.assertNotIn("redo_ocr", params)
|
||||
self.assertNotIn("force_ocr", params)
|
||||
|
||||
# AUTO mode alone (no skip_text): no extra OCR flag is set
|
||||
with override_settings(OCR_MODE="redo"):
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.mode = ModeChoices.AUTO
|
||||
instance.save()
|
||||
|
||||
params = self.get_params()
|
||||
self.assertTrue(params["skip_text"])
|
||||
self.assertNotIn("skip_text", params)
|
||||
self.assertNotIn("redo_ocr", params)
|
||||
self.assertNotIn("force_ocr", params)
|
||||
|
||||
|
||||
@@ -370,15 +370,26 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Multi-page digital PDF with sufficient text layer
|
||||
- Default settings (mode=auto, produce_archive=True)
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Archive is created (AUTO mode + text present + produce_archive=True
|
||||
→ PDF/A conversion via skip_text)
|
||||
- Text is extracted
|
||||
"""
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "simple-digital.pdf",
|
||||
tesseract_samples_dir / "multi-page-digital.pdf",
|
||||
"application/pdf",
|
||||
)
|
||||
assert tesseract_parser.archive_path is not None
|
||||
assert tesseract_parser.archive_path.is_file()
|
||||
assert_ordered_substrings(
|
||||
tesseract_parser.get_text(),
|
||||
["This is a test document."],
|
||||
tesseract_parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
def test_with_form_default(
|
||||
@@ -397,7 +408,7 @@ class TestParsePdf:
|
||||
["Please enter your name in here:", "This is a PDF document with a form."],
|
||||
)
|
||||
|
||||
def test_with_form_redo_produces_no_archive(
|
||||
def test_with_form_redo_no_archive_when_not_requested(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
@@ -406,6 +417,7 @@ class TestParsePdf:
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "with-form.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -433,7 +445,7 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(tesseract_samples_dir / "signed.pdf", "application/pdf")
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -449,7 +461,7 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "encrypted.pdf",
|
||||
"application/pdf",
|
||||
@@ -559,7 +571,7 @@ class TestParseMultiPage:
|
||||
@pytest.mark.parametrize(
|
||||
"mode",
|
||||
[
|
||||
pytest.param("skip", id="skip"),
|
||||
pytest.param("auto", id="auto"),
|
||||
pytest.param("redo", id="redo"),
|
||||
pytest.param("force", id="force"),
|
||||
],
|
||||
@@ -587,7 +599,7 @@ class TestParseMultiPage:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-images.pdf",
|
||||
"application/pdf",
|
||||
@@ -735,16 +747,18 @@ class TestSkipArchive:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with existing text layer
|
||||
- Mode: skip_noarchive
|
||||
- Mode: auto, produce_archive=False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text extracted; no archive created
|
||||
- Text extracted from original; no archive created (text exists +
|
||||
produce_archive=False skips OCRmyPDF entirely)
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-digital.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -760,13 +774,13 @@ class TestSkipArchive:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with image-only pages (no text layer)
|
||||
- Mode: skip_noarchive
|
||||
- Mode: auto, skip_archive_file: auto
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text extracted; archive created (OCR needed)
|
||||
- Text extracted; archive created (OCR needed, no existing text)
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-images.pdf",
|
||||
"application/pdf",
|
||||
@@ -778,41 +792,58 @@ class TestSkipArchive:
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("skip_archive_file", "filename", "expect_archive"),
|
||||
("produce_archive", "filename", "expect_archive"),
|
||||
[
|
||||
pytest.param("never", "multi-page-digital.pdf", True, id="never-with-text"),
|
||||
pytest.param("never", "multi-page-images.pdf", True, id="never-no-text"),
|
||||
pytest.param(
|
||||
"with_text",
|
||||
True,
|
||||
"multi-page-digital.pdf",
|
||||
False,
|
||||
id="with-text-layer",
|
||||
True,
|
||||
id="produce-archive-with-text",
|
||||
),
|
||||
pytest.param(
|
||||
"with_text",
|
||||
True,
|
||||
"multi-page-images.pdf",
|
||||
True,
|
||||
id="with-text-no-layer",
|
||||
id="produce-archive-no-text",
|
||||
),
|
||||
pytest.param(
|
||||
"always",
|
||||
False,
|
||||
"multi-page-digital.pdf",
|
||||
False,
|
||||
id="always-with-text",
|
||||
id="no-archive-with-text-layer",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
"multi-page-images.pdf",
|
||||
False,
|
||||
id="no-archive-no-text-layer",
|
||||
),
|
||||
pytest.param("always", "multi-page-images.pdf", False, id="always-no-text"),
|
||||
],
|
||||
)
|
||||
def test_skip_archive_file_setting(
|
||||
def test_produce_archive_flag(
|
||||
self,
|
||||
skip_archive_file: str,
|
||||
produce_archive: bool, # noqa: FBT001
|
||||
filename: str,
|
||||
expect_archive: str,
|
||||
expect_archive: bool, # noqa: FBT001
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.skip_archive_file = skip_archive_file
|
||||
tesseract_parser.parse(tesseract_samples_dir / filename, "application/pdf")
|
||||
"""
|
||||
GIVEN:
|
||||
- Various PDFs (with and without text layers)
|
||||
- produce_archive flag set to True or False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- archive_path is set if and only if produce_archive=True
|
||||
- Text is always extracted
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / filename,
|
||||
"application/pdf",
|
||||
produce_archive=produce_archive,
|
||||
)
|
||||
text = tesseract_parser.get_text().lower()
|
||||
assert_ordered_substrings(text, ["page 1", "page 2", "page 3"])
|
||||
if expect_archive:
|
||||
@@ -820,6 +851,59 @@ class TestSkipArchive:
|
||||
else:
|
||||
assert tesseract_parser.archive_path is None
|
||||
|
||||
def test_tagged_pdf_skips_ocr_in_auto_mode(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true)
|
||||
- Mode: auto, produce_archive=False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- OCRmyPDF is not invoked (tagged ⇒ original_has_text=True)
|
||||
- Text is extracted from the original via pdftotext
|
||||
- No archive is produced
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "simple-digital.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert tesseract_parser.get_text()
|
||||
|
||||
def test_tagged_pdf_produces_pdfa_archive_without_ocr(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true)
|
||||
- Mode: auto, produce_archive=True
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- OCRmyPDF runs with skip_text (PDF/A conversion only, no OCR)
|
||||
- Archive is produced
|
||||
- Text is preserved from the original
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "simple-digital.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=True,
|
||||
)
|
||||
assert tesseract_parser.archive_path is not None
|
||||
assert tesseract_parser.get_text()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parse — mixed pages / sidecar
|
||||
@@ -835,13 +919,13 @@ class TestParseMixed:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text in some pages (image) and some pages (digital)
|
||||
- Mode: skip
|
||||
- Mode: auto (skip_text), skip_archive_file: always
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- All pages extracted; archive created; sidecar notes skipped pages
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-mixed.pdf",
|
||||
"application/pdf",
|
||||
@@ -898,17 +982,18 @@ class TestParseMixed:
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with mixed pages
|
||||
- Mode: skip_noarchive
|
||||
- File with mixed pages (some with text, some image-only)
|
||||
- Mode: auto, produce_archive=False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- No archive created (file has text layer); later-page text present
|
||||
- No archive created (produce_archive=False); text from text layer present
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-mixed.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -923,12 +1008,12 @@ class TestParseMixed:
|
||||
|
||||
|
||||
class TestParseRotate:
|
||||
def test_rotate_skip_mode(
|
||||
def test_rotate_auto_mode(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.settings.rotate = True
|
||||
tesseract_parser.parse(tesseract_samples_dir / "rotated.pdf", "application/pdf")
|
||||
assert_ordered_substrings(
|
||||
@@ -955,12 +1040,19 @@ class TestParseRtl:
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- PDF with RTL Arabic text
|
||||
- PDF with RTL Arabic text in its text layer (short: 18 chars)
|
||||
- mode=off, produce_archive=True: PDF/A conversion via skip_text, no OCR engine
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Arabic content is extracted (normalised for bidi)
|
||||
- Arabic content is extracted from the PDF text layer (normalised for bidi)
|
||||
|
||||
Note: The RTL PDF has a short text layer (< VALID_TEXT_LENGTH=50) so AUTO mode
|
||||
would attempt full OCR, which fails due to PriorOcrFoundError and falls back to
|
||||
force-ocr with English Tesseract (producing garbage). Using mode="off" forces
|
||||
skip_text=True so the Arabic text layer is preserved through PDF/A conversion.
|
||||
"""
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "rtl-test.pdf",
|
||||
"application/pdf",
|
||||
@@ -971,7 +1063,8 @@ class TestParseRtl:
|
||||
if unicodedata.category(ch) != "Cf" and not ch.isspace()
|
||||
)
|
||||
assert "ةرازو" in normalised
|
||||
assert any(token in normalised for token in ("ةیلخادلا", "الاخليد"))
|
||||
# pdftotext uses Arabic Yeh (U+064A) where ocrmypdf used Farsi Yeh (U+06CC)
|
||||
assert any(token in normalised for token in ("ةیلخادلا", "الاخليد", "ةيلخادال"))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -1023,11 +1116,11 @@ class TestOcrmypdfParameters:
|
||||
assert ("clean" in params) == expected_clean
|
||||
assert ("clean_final" in params) == expected_clean_final
|
||||
|
||||
def test_clean_final_skip_mode(
|
||||
def test_clean_final_auto_mode(
|
||||
self,
|
||||
make_tesseract_parser: MakeTesseractParser,
|
||||
) -> None:
|
||||
with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="skip") as parser:
|
||||
with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="auto") as parser:
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
assert params["clean_final"] is True
|
||||
assert "clean" not in params
|
||||
@@ -1044,9 +1137,9 @@ class TestOcrmypdfParameters:
|
||||
@pytest.mark.parametrize(
|
||||
("ocr_mode", "ocr_deskew", "expect_deskew"),
|
||||
[
|
||||
pytest.param("skip", True, True, id="skip-deskew-on"),
|
||||
pytest.param("auto", True, True, id="auto-deskew-on"),
|
||||
pytest.param("redo", True, False, id="redo-deskew-off"),
|
||||
pytest.param("skip", False, False, id="skip-no-deskew"),
|
||||
pytest.param("auto", False, False, id="auto-no-deskew"),
|
||||
],
|
||||
)
|
||||
def test_deskew_option(
|
||||
|
||||
@@ -132,13 +132,13 @@ class TestOcrSettingsChecks:
|
||||
pytest.param(
|
||||
"OCR_MODE",
|
||||
"skip_noarchive",
|
||||
"deprecated",
|
||||
id="deprecated-mode",
|
||||
'OCR output mode "skip_noarchive"',
|
||||
id="deprecated-mode-now-invalid",
|
||||
),
|
||||
pytest.param(
|
||||
"OCR_SKIP_ARCHIVE_FILE",
|
||||
"ARCHIVE_FILE_GENERATION",
|
||||
"invalid",
|
||||
'OCR_SKIP_ARCHIVE_FILE setting "invalid"',
|
||||
'PAPERLESS_ARCHIVE_FILE_GENERATION setting "invalid"',
|
||||
id="invalid-skip-archive-file",
|
||||
),
|
||||
pytest.param(
|
||||
|
||||
64
src/paperless/tests/test_checks_v3.py
Normal file
64
src/paperless/tests/test_checks_v3.py
Normal file
@@ -0,0 +1,64 @@
|
||||
"""Tests for v3 system checks: deprecated v2 OCR env var warnings."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless.checks import check_deprecated_v2_ocr_env_vars
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
|
||||
class TestDeprecatedV2OcrEnvVarWarnings:
|
||||
def test_no_deprecated_vars_returns_empty(self, mocker: MockerFixture) -> None:
|
||||
"""No warnings when neither deprecated variable is set."""
|
||||
mocker.patch.dict(os.environ, {"PAPERLESS_OCR_MODE": "auto"}, clear=True)
|
||||
result = check_deprecated_v2_ocr_env_vars(None)
|
||||
assert result == []
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("env_var", "env_value", "expected_id", "expected_fragment"),
|
||||
[
|
||||
pytest.param(
|
||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE",
|
||||
"always",
|
||||
"paperless.W002",
|
||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE",
|
||||
id="skip-archive-file-warns",
|
||||
),
|
||||
pytest.param(
|
||||
"PAPERLESS_OCR_MODE",
|
||||
"skip",
|
||||
"paperless.W003",
|
||||
"skip",
|
||||
id="ocr-mode-skip-warns",
|
||||
),
|
||||
pytest.param(
|
||||
"PAPERLESS_OCR_MODE",
|
||||
"skip_noarchive",
|
||||
"paperless.W003",
|
||||
"skip_noarchive",
|
||||
id="ocr-mode-skip-noarchive-warns",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_deprecated_var_produces_one_warning(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
env_var: str,
|
||||
env_value: str,
|
||||
expected_id: str,
|
||||
expected_fragment: str,
|
||||
) -> None:
|
||||
"""Each deprecated setting in isolation produces exactly one warning."""
|
||||
mocker.patch.dict(os.environ, {env_var: env_value}, clear=True)
|
||||
result = check_deprecated_v2_ocr_env_vars(None)
|
||||
|
||||
assert len(result) == 1
|
||||
warning = result[0]
|
||||
assert warning.id == expected_id
|
||||
assert expected_fragment in warning.msg
|
||||
@@ -0,0 +1,89 @@
|
||||
from documents.tests.utils import TestMigrations
|
||||
|
||||
|
||||
class TestMigrateSkipArchiveFile(TestMigrations):
|
||||
migrate_from = "0007_optimize_integer_field_sizes"
|
||||
migrate_to = "0008_replace_skip_archive_file"
|
||||
|
||||
def setUpBeforeMigration(self, apps):
|
||||
ApplicationConfiguration = apps.get_model(
|
||||
"paperless",
|
||||
"ApplicationConfiguration",
|
||||
)
|
||||
ApplicationConfiguration.objects.all().delete()
|
||||
ApplicationConfiguration.objects.create(
|
||||
pk=1,
|
||||
mode="skip",
|
||||
skip_archive_file="always",
|
||||
)
|
||||
ApplicationConfiguration.objects.create(
|
||||
pk=2,
|
||||
mode="redo",
|
||||
skip_archive_file="with_text",
|
||||
)
|
||||
ApplicationConfiguration.objects.create(
|
||||
pk=3,
|
||||
mode="force",
|
||||
skip_archive_file="never",
|
||||
)
|
||||
ApplicationConfiguration.objects.create(
|
||||
pk=4,
|
||||
mode="skip_noarchive",
|
||||
skip_archive_file=None,
|
||||
)
|
||||
ApplicationConfiguration.objects.create(
|
||||
pk=5,
|
||||
mode="skip_noarchive",
|
||||
skip_archive_file="never",
|
||||
)
|
||||
ApplicationConfiguration.objects.create(pk=6, mode=None, skip_archive_file=None)
|
||||
|
||||
def _get_config(self, pk):
|
||||
ApplicationConfiguration = self.apps.get_model(
|
||||
"paperless",
|
||||
"ApplicationConfiguration",
|
||||
)
|
||||
return ApplicationConfiguration.objects.get(pk=pk)
|
||||
|
||||
def test_skip_mapped_to_auto(self):
|
||||
config = self._get_config(1)
|
||||
assert config.mode == "auto"
|
||||
|
||||
def test_skip_archive_always_mapped_to_never(self):
|
||||
config = self._get_config(1)
|
||||
assert config.archive_file_generation == "never"
|
||||
|
||||
def test_redo_unchanged(self):
|
||||
config = self._get_config(2)
|
||||
assert config.mode == "redo"
|
||||
|
||||
def test_skip_archive_with_text_mapped_to_auto(self):
|
||||
config = self._get_config(2)
|
||||
assert config.archive_file_generation == "auto"
|
||||
|
||||
def test_force_unchanged(self):
|
||||
config = self._get_config(3)
|
||||
assert config.mode == "force"
|
||||
|
||||
def test_skip_archive_never_mapped_to_always(self):
|
||||
config = self._get_config(3)
|
||||
assert config.archive_file_generation == "always"
|
||||
|
||||
def test_skip_noarchive_mapped_to_auto(self):
|
||||
config = self._get_config(4)
|
||||
assert config.mode == "auto"
|
||||
|
||||
def test_skip_noarchive_implies_archive_never(self):
|
||||
config = self._get_config(4)
|
||||
assert config.archive_file_generation == "never"
|
||||
|
||||
def test_skip_noarchive_explicit_skip_archive_takes_precedence(self):
|
||||
"""skip_archive_file=never maps to always, not overridden by skip_noarchive."""
|
||||
config = self._get_config(5)
|
||||
assert config.mode == "auto"
|
||||
assert config.archive_file_generation == "always"
|
||||
|
||||
def test_null_values_remain_null(self):
|
||||
config = self._get_config(6)
|
||||
assert config.mode is None
|
||||
assert config.archive_file_generation is None
|
||||
66
src/paperless/tests/test_ocr_config.py
Normal file
66
src/paperless/tests/test_ocr_config.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""Tests for OcrConfig archive_file_generation field behavior."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
from django.test import override_settings
|
||||
|
||||
from paperless.config import OcrConfig
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def null_app_config(mocker) -> MagicMock:
|
||||
"""Mock ApplicationConfiguration with all fields None → falls back to Django settings."""
|
||||
return mocker.MagicMock(
|
||||
output_type=None,
|
||||
pages=None,
|
||||
language=None,
|
||||
mode=None,
|
||||
archive_file_generation=None,
|
||||
image_dpi=None,
|
||||
unpaper_clean=None,
|
||||
deskew=None,
|
||||
rotate_pages=None,
|
||||
rotate_pages_threshold=None,
|
||||
max_image_pixels=None,
|
||||
color_conversion_strategy=None,
|
||||
user_args=None,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def make_ocr_config(mocker, null_app_config):
|
||||
mocker.patch(
|
||||
"paperless.config.BaseConfig._get_config_instance",
|
||||
return_value=null_app_config,
|
||||
)
|
||||
|
||||
def _make(**django_settings_overrides):
|
||||
with override_settings(**django_settings_overrides):
|
||||
return OcrConfig()
|
||||
|
||||
return _make
|
||||
|
||||
|
||||
class TestOcrConfigArchiveFileGeneration:
|
||||
def test_auto_from_settings(self, make_ocr_config) -> None:
|
||||
cfg = make_ocr_config(OCR_MODE="auto", ARCHIVE_FILE_GENERATION="auto")
|
||||
assert cfg.archive_file_generation == "auto"
|
||||
|
||||
def test_always_from_settings(self, make_ocr_config) -> None:
|
||||
cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="always")
|
||||
assert cfg.archive_file_generation == "always"
|
||||
|
||||
def test_never_from_settings(self, make_ocr_config) -> None:
|
||||
cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="never")
|
||||
assert cfg.archive_file_generation == "never"
|
||||
|
||||
def test_db_value_overrides_setting(self, make_ocr_config, null_app_config) -> None:
|
||||
null_app_config.archive_file_generation = "never"
|
||||
cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="always")
|
||||
assert cfg.archive_file_generation == "never"
|
||||
25
src/paperless/tests/test_parser_utils.py
Normal file
25
src/paperless/tests/test_parser_utils.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""Tests for paperless.parsers.utils helpers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from paperless.parsers.utils import is_tagged_pdf
|
||||
|
||||
SAMPLES = Path(__file__).parent / "samples" / "tesseract"
|
||||
|
||||
|
||||
class TestIsTaggedPdf:
|
||||
def test_tagged_pdf_returns_true(self) -> None:
|
||||
assert is_tagged_pdf(SAMPLES / "simple-digital.pdf") is True
|
||||
|
||||
def test_untagged_pdf_returns_false(self) -> None:
|
||||
assert is_tagged_pdf(SAMPLES / "multi-page-images.pdf") is False
|
||||
|
||||
def test_nonexistent_path_returns_false(self) -> None:
|
||||
assert is_tagged_pdf(Path("/nonexistent/file.pdf")) is False
|
||||
|
||||
def test_corrupt_pdf_returns_false(self, tmp_path: Path) -> None:
|
||||
bad = tmp_path / "bad.pdf"
|
||||
bad.write_bytes(b"not a pdf")
|
||||
assert is_tagged_pdf(bad) is False
|
||||
96
uv.lock
generated
96
uv.lock
generated
@@ -4920,18 +4920,18 @@ dependencies = [
|
||||
{ name = "typing-extensions", marker = "sys_platform == 'darwin'" },
|
||||
]
|
||||
wheels = [
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0-1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:0826ac8e409551e12b2360ac18b4161a838cbd111933e694752f351191331d09" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0-1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:7fbbf409143a4fe0812a40c0b46a436030a7e1d14fe8c5234dfbe44df47f617e" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0-1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:b39cafff7229699f9d6e172cac74d85fd71b568268e439e08d9c540e54732a3e" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0-2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:90821a3194b8806d9fa9fdaa9308c1bc73df0c26808274b14129a97c99f35794" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:358bd7125cbec6e692d60618a5eec7f55a51b29e3652a849fd42af021d818023" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0-2-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:470de4176007c2700735e003a830828a88d27129032a3add07291da07e2a94e8" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:4584ab167995c0479f6821e3dceaf199c8166c811d3adbba5d8eedbbfa6764fd" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:45a1c5057629444aeb1c452c18298fa7f30f2f7aeadd4dc41f9d340980294407" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:339e05502b6c839db40e88720cb700f5a3b50cda332284873e851772d41b2c1e" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:840351da59cedb7bcbc51981880050813c19ef6b898a7fecf73a3afc71aff3fe" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:c88b1129fd4e14f0f882963c6728315caae35d2f47374d17edeed1edc7697497" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:f4bea7dc451267c028593751612ad559299589304e68df54ae7672427893ff2c" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:0826ac8e409551e12b2360ac18b4161a838cbd111933e694752f351191331d09" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:7fbbf409143a4fe0812a40c0b46a436030a7e1d14fe8c5234dfbe44df47f617e" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:b39cafff7229699f9d6e172cac74d85fd71b568268e439e08d9c540e54732a3e" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:90821a3194b8806d9fa9fdaa9308c1bc73df0c26808274b14129a97c99f35794" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:358bd7125cbec6e692d60618a5eec7f55a51b29e3652a849fd42af021d818023" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-2-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:470de4176007c2700735e003a830828a88d27129032a3add07291da07e2a94e8" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:4584ab167995c0479f6821e3dceaf199c8166c811d3adbba5d8eedbbfa6764fd" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:45a1c5057629444aeb1c452c18298fa7f30f2f7aeadd4dc41f9d340980294407" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:339e05502b6c839db40e88720cb700f5a3b50cda332284873e851772d41b2c1e" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:840351da59cedb7bcbc51981880050813c19ef6b898a7fecf73a3afc71aff3fe" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:c88b1129fd4e14f0f882963c6728315caae35d2f47374d17edeed1edc7697497" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:f4bea7dc451267c028593751612ad559299589304e68df54ae7672427893ff2c" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4954,30 +4954,30 @@ dependencies = [
|
||||
{ name = "typing-extensions", marker = "sys_platform == 'linux'" },
|
||||
]
|
||||
wheels = [
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-linux_aarch64.whl", hash = "sha256:ce5c113d1f55f8c1f5af05047a24e50d11d293e0cbbb5bf7a75c6c761edd6eaa" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-linux_s390x.whl", hash = "sha256:0e286fcf6ce0cc7b204396c9b4ea0d375f1f0c3e752f68ce3d3aeb265511db8c" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:1cfcb9b1558c6e52dffd0d4effce83b13c5ae5d97338164c372048c21f9cfccb" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:b7cb1ec66cefb90fd7b676eac72cfda3b8d4e4d0cacd7a531963bc2e0a9710ab" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-linux_aarch64.whl", hash = "sha256:8de5a36371b775e2d4881ed12cc7f2de400b1ad3d728aa74a281f649f87c9b8c" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-linux_s390x.whl", hash = "sha256:9accc30b56cb6756d4a9d04fcb8ebc0bb68c7d55c1ed31a8657397d316d31596" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:179451716487f8cb09b56459667fa1f5c4c0946c1e75fbeae77cfc40a5768d87" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ee40b8a4b4b2cf0670c6fd4f35a7ef23871af956fecb238fbf5da15a72650b1d" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-linux_aarch64.whl", hash = "sha256:fd215f3d0f681905c5b56b0630a3d666900a37fcc3ca5b937f95275c66f9fd9c" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-linux_s390x.whl", hash = "sha256:170a0623108055be5199370335cf9b41ba6875b3cb6f086db4aee583331a4899" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:e51994492cdb76edce29da88de3672a3022f9ef0ffd90345436948d4992be2c7" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:8d316e5bf121f1eab1147e49ad0511a9d92e4c45cc357d1ab0bee440da71a095" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-linux_aarch64.whl", hash = "sha256:5af75e5f49de21b0bdf7672bc27139bd285f9e8dbcabe2d617a2eb656514ac36" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-linux_s390x.whl", hash = "sha256:ba51ef01a510baf8fff576174f702c47e1aa54389a9f1fba323bb1a5003ff0bf" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:0fedcb1a77e8f2aaf7bfd21591bf6d1e0b207473268c9be16b17cb7783253969" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:106dd1930cb30a4a337366ba3f9b25318ebf940f51fd46f789281dd9e736bdc4" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-linux_aarch64.whl", hash = "sha256:ea2bcc9d1fca66974a71d4bf9a502539283f35d61fcab5a799b4e120846f1e02" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-linux_s390x.whl", hash = "sha256:f8294fd2fc6dd8f4435a891a0122307a043b14b21f0dac1bca63c85bfb59e586" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:a28fdbcfa2fbacffec81300f24dd1bed2b0ccfdbed107a823cff12bc1db070f6" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:aada8afc068add586464b2a55adb7cc9091eec55caf5320447204741cb6a0604" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-linux_aarch64.whl", hash = "sha256:9412bd37b70f5ebd1205242c4ba4cabae35a605947f2b30806d5c9b467936db9" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-linux_s390x.whl", hash = "sha256:e71c476517c33e7db69825a9ff46c7f47a723ec4dac5b2481cff4246d1c632be" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:23882f8d882460aca809882fc42f5e343bf07585274f929ced00177d1be1eb67" },
|
||||
{ url = "https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:4fcd8b4cc2ae20f2b7749fb275349c55432393868778c2d50a08e81d5ee5591e" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-linux_aarch64.whl", hash = "sha256:ce5c113d1f55f8c1f5af05047a24e50d11d293e0cbbb5bf7a75c6c761edd6eaa" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-linux_s390x.whl", hash = "sha256:0e286fcf6ce0cc7b204396c9b4ea0d375f1f0c3e752f68ce3d3aeb265511db8c" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:1cfcb9b1558c6e52dffd0d4effce83b13c5ae5d97338164c372048c21f9cfccb" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:b7cb1ec66cefb90fd7b676eac72cfda3b8d4e4d0cacd7a531963bc2e0a9710ab" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-linux_aarch64.whl", hash = "sha256:8de5a36371b775e2d4881ed12cc7f2de400b1ad3d728aa74a281f649f87c9b8c" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-linux_s390x.whl", hash = "sha256:9accc30b56cb6756d4a9d04fcb8ebc0bb68c7d55c1ed31a8657397d316d31596" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:179451716487f8cb09b56459667fa1f5c4c0946c1e75fbeae77cfc40a5768d87" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ee40b8a4b4b2cf0670c6fd4f35a7ef23871af956fecb238fbf5da15a72650b1d" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-linux_aarch64.whl", hash = "sha256:fd215f3d0f681905c5b56b0630a3d666900a37fcc3ca5b937f95275c66f9fd9c" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-linux_s390x.whl", hash = "sha256:170a0623108055be5199370335cf9b41ba6875b3cb6f086db4aee583331a4899" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:e51994492cdb76edce29da88de3672a3022f9ef0ffd90345436948d4992be2c7" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:8d316e5bf121f1eab1147e49ad0511a9d92e4c45cc357d1ab0bee440da71a095" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-linux_aarch64.whl", hash = "sha256:5af75e5f49de21b0bdf7672bc27139bd285f9e8dbcabe2d617a2eb656514ac36" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-linux_s390x.whl", hash = "sha256:ba51ef01a510baf8fff576174f702c47e1aa54389a9f1fba323bb1a5003ff0bf" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:0fedcb1a77e8f2aaf7bfd21591bf6d1e0b207473268c9be16b17cb7783253969" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:106dd1930cb30a4a337366ba3f9b25318ebf940f51fd46f789281dd9e736bdc4" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-linux_aarch64.whl", hash = "sha256:ea2bcc9d1fca66974a71d4bf9a502539283f35d61fcab5a799b4e120846f1e02" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-linux_s390x.whl", hash = "sha256:f8294fd2fc6dd8f4435a891a0122307a043b14b21f0dac1bca63c85bfb59e586" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:a28fdbcfa2fbacffec81300f24dd1bed2b0ccfdbed107a823cff12bc1db070f6" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:aada8afc068add586464b2a55adb7cc9091eec55caf5320447204741cb6a0604" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-linux_aarch64.whl", hash = "sha256:9412bd37b70f5ebd1205242c4ba4cabae35a605947f2b30806d5c9b467936db9" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-linux_s390x.whl", hash = "sha256:e71c476517c33e7db69825a9ff46c7f47a723ec4dac5b2481cff4246d1c632be" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:23882f8d882460aca809882fc42f5e343bf07585274f929ced00177d1be1eb67" },
|
||||
{ url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:4fcd8b4cc2ae20f2b7749fb275349c55432393868778c2d50a08e81d5ee5591e" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5710,7 +5710,7 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "zensical"
|
||||
version = "0.0.29"
|
||||
version = "0.0.31"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
@@ -5720,18 +5720,18 @@ dependencies = [
|
||||
{ name = "pymdown-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "pyyaml", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/78/bd/5786ab618a60bd7469ab243a7fd2c9eecb0790c85c784abb8b97edb77a54/zensical-0.0.29.tar.gz", hash = "sha256:0d6282be7cb551e12d5806badf5e94c54a5e2f2cf07057a3e36d1eaf97c33ada", size = 3842641, upload-time = "2026-03-24T13:37:27.587Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d5/1a/9b6f5285c5aef648db38f9132f49a7059bd2c9d748f68ef0c52ed8afcff3/zensical-0.0.31.tar.gz", hash = "sha256:9c12f07bde70c4bfdb13d6cae1bedf8d18064d257a6e81128a152502b28a8fc3", size = 3891758, upload-time = "2026-04-01T11:30:21.88Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/4b/9c/8b681daa024abca9763017bec09ecee8008e110cae1254217c8dd22cc339/zensical-0.0.29-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:20ae0709ea14fce25ab33d0a82acdaf454a7a2e232a9ee20c019942205174476", size = 12311399, upload-time = "2026-03-24T13:36:53.809Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/81/ae/4ebb4d8bb2ef0164d473698b92f11caf431fc436e1625524acd5641102ca/zensical-0.0.29-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:599af3ba66fcd0146d7019f3493ed3c316051fae6c4d5599bc59f3a8f4b8a6f0", size = 12191845, upload-time = "2026-03-24T13:36:56.909Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d5/35/67f89db06571a52283b3ecbe3bcf32fd3115ca50436b3ae177a948b83ea7/zensical-0.0.29-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eea7e48a00a71c0586e875079b5f83a070c33a147e52ad4383e4b63ab524332b", size = 12554105, upload-time = "2026-03-24T13:36:59.945Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7c/f6/ac79e5d9c18b28557c9ff1c7c23d695fbdd82645d69bfe02292f46d935e7/zensical-0.0.29-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:59a57db35542e98d2896b833de07d199320f8ada3b4e7ddccb7fe892292d8b74", size = 12498643, upload-time = "2026-03-24T13:37:02.376Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b1/70/5c22a96a69e0e91e569c26236918bb9bab1170f59b29ad04105ead64f199/zensical-0.0.29-cp310-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d42c2b2a96a80cf64c98ba7242f59ef95109914bd4c9499d7ebc12544663852c", size = 12854531, upload-time = "2026-03-24T13:37:04.962Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/79/25/e32237a8fcb0ceae1ef8e192e7f8db53b38f1e48f1c7cdbacd0a7b713892/zensical-0.0.29-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b2fca39c5f6b1782c77cf6591cf346357cabee85ebdb956c5ddc0fd5169f3d9", size = 12596828, upload-time = "2026-03-24T13:37:07.817Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ff/74/89ac909cbb258903ea53802c184e4986c17ce0ba79b1c7f77b7e78a2dce3/zensical-0.0.29-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:dfc23a74ef672aa51088c080286319da1dc0b989cd5051e9e5e6d7d4abbc2fc1", size = 12732059, upload-time = "2026-03-24T13:37:11.651Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8c/31/2429de6a9328eed4acc7e9a3789f160294a15115be15f9870a0d02649302/zensical-0.0.29-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:c9336d4e4b232e3c9a70e30258e916dd7e60c0a2a08c8690065e60350c302028", size = 12768542, upload-time = "2026-03-24T13:37:14.39Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/10/8a/55588b2a1dcbe86dad0404506c9ba367a06c663b1ff47147c84d26f7510e/zensical-0.0.29-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:30661148f0681199f3b598cbeb1d54f5cba773e54ae840bac639250d85907b84", size = 12917991, upload-time = "2026-03-24T13:37:16.795Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ec/5d/653901f0d3a3ca72daebc62746a148797f4e422cc3a2b66a4e6718e4398f/zensical-0.0.29-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6a566ac1fd4bfac5d711a7bd1ae06666712127c2718daa5083c7bf3f107e8578", size = 12868392, upload-time = "2026-03-24T13:37:19.42Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c2/db/cc4e555d2e816f2d91304ff969d62cc3a401ee477dbb7c720b874bec67d6/zensical-0.0.31-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:b489936d670733dd204f16b689a2acc0e45b69e42cc4901f5131ae57658b8fbc", size = 12419980, upload-time = "2026-04-01T11:29:44.01Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e7/c1/6789f73164c7f5821f5defb8a80b1dba8d5af24bdec7db36876793c5afd9/zensical-0.0.31-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:d9f678efc0d9918e45eeb8bc62847b2cce23db7393c8c59c1be6d1c064bbaacd", size = 12292301, upload-time = "2026-04-01T11:29:47.277Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4f/9a/6a83ad209081a953e0285d5056e5452c4fbcabd2f104f3797d53e4bdd96f/zensical-0.0.31-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb2b50ecf674997f818e53f12f2a67875a21b0c79ed74c151dfaef2f1475e5bf", size = 12661472, upload-time = "2026-04-01T11:29:50.706Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9c/4a/a82f5c81893b7a607cf9d439b75c3c3894b4ef4d3e92d5d818b4fa5c6f23/zensical-0.0.31-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6fb5c634fe88254770a2d4db5c05b06f1c3ee5e29d2ae3e7efdae8905e435b1d", size = 12603784, upload-time = "2026-04-01T11:29:53.623Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f7/1c/79c198628b8e006be32dfb1c5b73561757a349a6cf3069600a67ffa62495/zensical-0.0.31-cp310-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:94e64630552793274db1ec66c971e49a15ad351536d5d12de67ec6da7358ac50", size = 12959832, upload-time = "2026-04-01T11:29:56.736Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/db/9d/45839d9ca0f69622e8a3b944f2d8d7f7d2b7c2da78201079c4feb275feb6/zensical-0.0.31-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:738a2fd5832e3b3c10ff642eebaf89c89ca1d28e4451dad0f36fdac53c415577", size = 12704024, upload-time = "2026-04-01T11:29:59.836Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/df/5f/451d7f4d94092bc38bd8d514826fb7b0329c188db506795b1d20bd07d517/zensical-0.0.31-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:bd601f6132e285ef6c3e4c3852be2094fc0473295a8080003db76a79760f84fb", size = 12837788, upload-time = "2026-04-01T11:30:03.048Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d8/39/390a8fc384fb174ebd4450343a0aa2877b3a31ddcedf5ef0b8d26944e12c/zensical-0.0.31-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:dc3b6a9dfb5903c0aa779ef65cd6185add2b8aa1db237be840874b8c9db761b8", size = 12876822, upload-time = "2026-04-01T11:30:06.418Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d5/60/640da2f095782cf38974cd851fb7afa62651d09a36543a1d8942b31aabdc/zensical-0.0.31-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:ddd4321b275e82c4897aa45b05038ce204b88fb311ad55f8c2af572173a9b56c", size = 13024036, upload-time = "2026-04-01T11:30:09.501Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3f/06/0564377cbfccea3653254adfa851c1b20d1696e4b16770c7b2e1dd1ef1d7/zensical-0.0.31-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:147ab4bc17f3088f703aa6c4b9c416411f4ea8ca64d26f6586beae49d97fd3c7", size = 12975505, upload-time = "2026-04-01T11:30:12.268Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
Reference in New Issue
Block a user