Compare commits
68 Commits
fix-bulk-e
...
dev
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b049ad9626 | ||
|
|
79def8a200 | ||
|
|
701735f6e5 | ||
|
|
07f54bfdab | ||
|
|
0f84af27d0 | ||
|
|
9646b8c67d | ||
|
|
e590d7df69 | ||
|
|
cc71aad058 | ||
|
|
3cbdf5d0b7 | ||
|
|
f84e0097e5 | ||
|
|
7dbf8bdd4a | ||
|
|
d2a752a196 | ||
|
|
2cb155e717 | ||
|
|
9e9fc6213c | ||
|
|
a9756f9462 | ||
|
|
c2b8b22fb4 | ||
|
|
d671e34559 | ||
|
|
f7c12d550a | ||
|
|
68fc898042 | ||
|
|
2cbe6ae892 | ||
|
|
b0bb31654f | ||
|
|
0f7c02de5e | ||
|
|
95dea787f2 | ||
|
|
b6501b0c47 | ||
|
|
d162c83eb7 | ||
|
|
d3ac75741f | ||
|
|
87ebd13abc | ||
|
|
3abff21d1f | ||
|
|
0a08499fc7 | ||
|
|
330ee696a8 | ||
|
|
b98697ab8b | ||
|
|
7e94dd8208 | ||
|
|
79da72f69c | ||
|
|
261ae9d8ce | ||
|
|
0e2c191524 | ||
|
|
ab4656692d | ||
|
|
03e2c352c2 | ||
|
|
2d46ed9692 | ||
|
|
8d23d17ae8 | ||
|
|
aea2927a02 | ||
|
|
a86c9d32fe | ||
|
|
d53dcad4f6 | ||
|
|
736b08ad09 | ||
|
|
ca5879a54e | ||
|
|
4d4f30b5f8 | ||
|
|
85fecac401 | ||
|
|
7942edfdf4 | ||
|
|
470018c011 | ||
|
|
54679a093a | ||
|
|
58ebcc21be | ||
|
|
1caa3eb8aa | ||
|
|
866c9fd858 | ||
|
|
2bb4af2be6 | ||
|
|
6b8ff9763d | ||
|
|
6034f17c87 | ||
|
|
48cd1cce6a | ||
|
|
1e00ad5f30 | ||
|
|
5f26c01c6f | ||
|
|
92e133eeb0 | ||
|
|
06b2d5102c | ||
|
|
9d69705e26 | ||
|
|
01abacab52 | ||
|
|
88b8f9b326 | ||
|
|
365ff99934 | ||
|
|
40255cfdbb | ||
|
|
d919c341b1 | ||
|
|
ba0a80a8ad | ||
|
|
60319c6d37 |
3
.github/dependabot.yml
vendored
@@ -157,6 +157,9 @@ updates:
|
|||||||
postgres:
|
postgres:
|
||||||
patterns:
|
patterns:
|
||||||
- "docker.io/library/postgres*"
|
- "docker.io/library/postgres*"
|
||||||
|
greenmail:
|
||||||
|
patterns:
|
||||||
|
- "docker.io/greenmail*"
|
||||||
- package-ecosystem: "pre-commit" # See documentation for possible values
|
- package-ecosystem: "pre-commit" # See documentation for possible values
|
||||||
directory: "/" # Location of package manifests
|
directory: "/" # Location of package manifests
|
||||||
schedule:
|
schedule:
|
||||||
|
|||||||
6
.github/workflows/ci-docker.yml
vendored
@@ -119,7 +119,7 @@ jobs:
|
|||||||
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
|
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
|
||||||
- name: Docker metadata
|
- name: Docker metadata
|
||||||
id: docker-meta
|
id: docker-meta
|
||||||
uses: docker/metadata-action@v5.10.0
|
uses: docker/metadata-action@v6.0.0
|
||||||
with:
|
with:
|
||||||
images: |
|
images: |
|
||||||
${{ env.REGISTRY }}/${{ steps.repo.outputs.name }}
|
${{ env.REGISTRY }}/${{ steps.repo.outputs.name }}
|
||||||
@@ -130,7 +130,7 @@ jobs:
|
|||||||
type=semver,pattern={{major}}.{{minor}}
|
type=semver,pattern={{major}}.{{minor}}
|
||||||
- name: Build and push by digest
|
- name: Build and push by digest
|
||||||
id: build
|
id: build
|
||||||
uses: docker/build-push-action@v6.19.2
|
uses: docker/build-push-action@v7.0.0
|
||||||
with:
|
with:
|
||||||
context: .
|
context: .
|
||||||
file: ./Dockerfile
|
file: ./Dockerfile
|
||||||
@@ -201,7 +201,7 @@ jobs:
|
|||||||
password: ${{ secrets.QUAY_ROBOT_TOKEN }}
|
password: ${{ secrets.QUAY_ROBOT_TOKEN }}
|
||||||
- name: Docker metadata
|
- name: Docker metadata
|
||||||
id: docker-meta
|
id: docker-meta
|
||||||
uses: docker/metadata-action@v5.10.0
|
uses: docker/metadata-action@v6.0.0
|
||||||
with:
|
with:
|
||||||
images: |
|
images: |
|
||||||
${{ env.REGISTRY }}/${{ needs.build-arch.outputs.repository }}
|
${{ env.REGISTRY }}/${{ needs.build-arch.outputs.repository }}
|
||||||
|
|||||||
@@ -2437,17 +2437,3 @@ src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "Non
|
|||||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr]
|
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr]
|
||||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr]
|
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr]
|
||||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "user_args" [union-attr]
|
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "user_args" [union-attr]
|
||||||
src/paperless_text/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
|
||||||
src/paperless_text/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
|
||||||
src/paperless_text/parsers.py:0: error: Incompatible types in assignment (expression has type "str", variable has type "None") [assignment]
|
|
||||||
src/paperless_text/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
|
||||||
src/paperless_text/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
|
||||||
src/paperless_tika/parsers.py:0: error: Argument 1 to "make_thumbnail_from_pdf" has incompatible type "None"; expected "Path" [arg-type]
|
|
||||||
src/paperless_tika/parsers.py:0: error: Function is missing a return type annotation [no-untyped-def]
|
|
||||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation [no-untyped-def]
|
|
||||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation [no-untyped-def]
|
|
||||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
|
||||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
|
||||||
src/paperless_tika/parsers.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "None") [assignment]
|
|
||||||
src/paperless_tika/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
|
||||||
src/paperless_tika/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ repos:
|
|||||||
- 'prettier-plugin-organize-imports@4.3.0'
|
- 'prettier-plugin-organize-imports@4.3.0'
|
||||||
# Python hooks
|
# Python hooks
|
||||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||||
rev: v0.15.5
|
rev: v0.15.6
|
||||||
hooks:
|
hooks:
|
||||||
- id: ruff-check
|
- id: ruff-check
|
||||||
- id: ruff-format
|
- id: ruff-format
|
||||||
|
|||||||
@@ -18,13 +18,13 @@ services:
|
|||||||
- "--log-level=warn"
|
- "--log-level=warn"
|
||||||
- "--log-format=text"
|
- "--log-format=text"
|
||||||
tika:
|
tika:
|
||||||
image: docker.io/apache/tika:latest
|
image: docker.io/apache/tika:3.2.3.0
|
||||||
hostname: tika
|
hostname: tika
|
||||||
container_name: tika
|
container_name: tika
|
||||||
network_mode: host
|
network_mode: host
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
greenmail:
|
greenmail:
|
||||||
image: greenmail/standalone:2.1.8
|
image: docker.io/greenmail/standalone:2.1.8
|
||||||
hostname: greenmail
|
hostname: greenmail
|
||||||
container_name: greenmail
|
container_name: greenmail
|
||||||
environment:
|
environment:
|
||||||
|
|||||||
@@ -2,6 +2,17 @@
|
|||||||
# shellcheck shell=bash
|
# shellcheck shell=bash
|
||||||
declare -r log_prefix="[init-user]"
|
declare -r log_prefix="[init-user]"
|
||||||
|
|
||||||
|
# When the container is started as a non-root user (e.g. via `user: 999:999`
|
||||||
|
# in Docker Compose), usermod/groupmod require root and are meaningless.
|
||||||
|
# USERMAP_* variables only apply to the root-started path.
|
||||||
|
if [[ -n "${USER_IS_NON_ROOT}" ]]; then
|
||||||
|
if [[ -n "${USERMAP_UID}" || -n "${USERMAP_GID}" ]]; then
|
||||||
|
echo "${log_prefix} WARNING: USERMAP_UID/USERMAP_GID are set but have no effect when the container is started as a non-root user"
|
||||||
|
fi
|
||||||
|
echo "${log_prefix} Running as non-root user ($(id --user):$(id --group)), skipping UID/GID remapping"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
declare -r usermap_original_uid=$(id -u paperless)
|
declare -r usermap_original_uid=$(id -u paperless)
|
||||||
declare -r usermap_original_gid=$(id -g paperless)
|
declare -r usermap_original_gid=$(id -g paperless)
|
||||||
declare -r usermap_new_uid=${USERMAP_UID:-$usermap_original_uid}
|
declare -r usermap_new_uid=${USERMAP_UID:-$usermap_original_uid}
|
||||||
|
|||||||
@@ -1,5 +1,56 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## paperless-ngx 2.20.12
|
||||||
|
|
||||||
|
### Security
|
||||||
|
|
||||||
|
- Resolve [GHSA-96jx-fj7m-qh6x](https://github.com/paperless-ngx/paperless-ngx/security/advisories/GHSA-96jx-fj7m-qh6x)
|
||||||
|
|
||||||
|
### Bug Fixes
|
||||||
|
|
||||||
|
- Fix: Scope the workflow saves to prevent clobbering filename/archive_filename [@stumpylog](https://github.com/stumpylog) ([#12390](https://github.com/paperless-ngx/paperless-ngx/pull/12390))
|
||||||
|
- Fix: don't try to usermod/groupmod when non-root + update docs (#<!---->12365) [@stumpylog](https://github.com/stumpylog) ([#12391](https://github.com/paperless-ngx/paperless-ngx/pull/12391))
|
||||||
|
- Fix: avoid moving files if already moved [@shamoon](https://github.com/shamoon) ([#12389](https://github.com/paperless-ngx/paperless-ngx/pull/12389))
|
||||||
|
- Fix: remove pagination from document notes api spec [@shamoon](https://github.com/shamoon) ([#12388](https://github.com/paperless-ngx/paperless-ngx/pull/12388))
|
||||||
|
- Fix: fix file button hover color in dark mode [@shamoon](https://github.com/shamoon) ([#12367](https://github.com/paperless-ngx/paperless-ngx/pull/12367))
|
||||||
|
- Fixhancement: only offer basic auth for appropriate requests [@shamoon](https://github.com/shamoon) ([#12362](https://github.com/paperless-ngx/paperless-ngx/pull/12362))
|
||||||
|
|
||||||
|
### All App Changes
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>5 changes</summary>
|
||||||
|
|
||||||
|
- Fix: Scope the workflow saves to prevent clobbering filename/archive_filename [@stumpylog](https://github.com/stumpylog) ([#12390](https://github.com/paperless-ngx/paperless-ngx/pull/12390))
|
||||||
|
- Fix: avoid moving files if already moved [@shamoon](https://github.com/shamoon) ([#12389](https://github.com/paperless-ngx/paperless-ngx/pull/12389))
|
||||||
|
- Fix: remove pagination from document notes api spec [@shamoon](https://github.com/shamoon) ([#12388](https://github.com/paperless-ngx/paperless-ngx/pull/12388))
|
||||||
|
- Fix: fix file button hover color in dark mode [@shamoon](https://github.com/shamoon) ([#12367](https://github.com/paperless-ngx/paperless-ngx/pull/12367))
|
||||||
|
- Fixhancement: only offer basic auth for appropriate requests [@shamoon](https://github.com/shamoon) ([#12362](https://github.com/paperless-ngx/paperless-ngx/pull/12362))
|
||||||
|
</details>
|
||||||
|
|
||||||
|
## paperless-ngx 2.20.11
|
||||||
|
|
||||||
|
### Security
|
||||||
|
|
||||||
|
- Resolve [GHSA-59xh-5vwx-4c4q](https://github.com/paperless-ngx/paperless-ngx/security/advisories/GHSA-59xh-5vwx-4c4q)
|
||||||
|
|
||||||
|
### Bug Fixes
|
||||||
|
|
||||||
|
- Fix: correct dropdown list active color in dark mode [@shamoon](https://github.com/shamoon) ([#12328](https://github.com/paperless-ngx/paperless-ngx/pull/12328))
|
||||||
|
- Fixhancement: clear descendant selections in dropdown when parent toggled [@shamoon](https://github.com/shamoon) ([#12326](https://github.com/paperless-ngx/paperless-ngx/pull/12326))
|
||||||
|
- Fix: prevent wrapping with larger amounts of tags on small cards, reset moreTags setting to correct count [@shamoon](https://github.com/shamoon) ([#12302](https://github.com/paperless-ngx/paperless-ngx/pull/12302))
|
||||||
|
- Fix: prevent stale db filename during workflow actions [@shamoon](https://github.com/shamoon) ([#12289](https://github.com/paperless-ngx/paperless-ngx/pull/12289))
|
||||||
|
|
||||||
|
### All App Changes
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>4 changes</summary>
|
||||||
|
|
||||||
|
- Fix: correct dropdown list active color in dark mode [@shamoon](https://github.com/shamoon) ([#12328](https://github.com/paperless-ngx/paperless-ngx/pull/12328))
|
||||||
|
- Fixhancement: clear descendant selections in dropdown when parent toggled [@shamoon](https://github.com/shamoon) ([#12326](https://github.com/paperless-ngx/paperless-ngx/pull/12326))
|
||||||
|
- Fix: prevent wrapping with larger amounts of tags on small cards, reset moreTags setting to correct count [@shamoon](https://github.com/shamoon) ([#12302](https://github.com/paperless-ngx/paperless-ngx/pull/12302))
|
||||||
|
- Fix: prevent stale db filename during workflow actions [@shamoon](https://github.com/shamoon) ([#12289](https://github.com/paperless-ngx/paperless-ngx/pull/12289))
|
||||||
|
</details>
|
||||||
|
|
||||||
## paperless-ngx 2.20.10
|
## paperless-ngx 2.20.10
|
||||||
|
|
||||||
### Bug Fixes
|
### Bug Fixes
|
||||||
|
|||||||
@@ -674,6 +674,9 @@ See the corresponding [django-allauth documentation](https://docs.allauth.org/en
|
|||||||
for a list of provider configurations. You will also need to include the relevant Django 'application' inside the
|
for a list of provider configurations. You will also need to include the relevant Django 'application' inside the
|
||||||
[PAPERLESS_APPS](#PAPERLESS_APPS) setting to activate that specific authentication provider (e.g. `allauth.socialaccount.providers.openid_connect` for the [OIDC Connect provider](https://docs.allauth.org/en/latest/socialaccount/providers/openid_connect.html)).
|
[PAPERLESS_APPS](#PAPERLESS_APPS) setting to activate that specific authentication provider (e.g. `allauth.socialaccount.providers.openid_connect` for the [OIDC Connect provider](https://docs.allauth.org/en/latest/socialaccount/providers/openid_connect.html)).
|
||||||
|
|
||||||
|
: For OpenID Connect providers, set `settings.token_auth_method` if your identity provider
|
||||||
|
requires a specific token endpoint authentication method.
|
||||||
|
|
||||||
Defaults to None, which does not enable any third party authentication systems.
|
Defaults to None, which does not enable any third party authentication systems.
|
||||||
|
|
||||||
#### [`PAPERLESS_SOCIAL_AUTO_SIGNUP=<bool>`](#PAPERLESS_SOCIAL_AUTO_SIGNUP) {#PAPERLESS_SOCIAL_AUTO_SIGNUP}
|
#### [`PAPERLESS_SOCIAL_AUTO_SIGNUP=<bool>`](#PAPERLESS_SOCIAL_AUTO_SIGNUP) {#PAPERLESS_SOCIAL_AUTO_SIGNUP}
|
||||||
@@ -1947,6 +1950,12 @@ current backend. If not supplied, defaults to "gpt-3.5-turbo" for OpenAI and "ll
|
|||||||
|
|
||||||
Defaults to None.
|
Defaults to None.
|
||||||
|
|
||||||
|
#### [`PAPERLESS_AI_LLM_ALLOW_INTERNAL_ENDPOINTS=<bool>`](#PAPERLESS_AI_LLM_ALLOW_INTERNAL_ENDPOINTS) {#PAPERLESS_AI_LLM_ALLOW_INTERNAL_ENDPOINTS}
|
||||||
|
|
||||||
|
: If set to false, Paperless blocks AI endpoint URLs that resolve to non-public addresses (e.g., localhost, etc).
|
||||||
|
|
||||||
|
Defaults to true, which allows internal endpoints.
|
||||||
|
|
||||||
#### [`PAPERLESS_AI_LLM_INDEX_TASK_CRON=<cron expression>`](#PAPERLESS_AI_LLM_INDEX_TASK_CRON) {#PAPERLESS_AI_LLM_INDEX_TASK_CRON}
|
#### [`PAPERLESS_AI_LLM_INDEX_TASK_CRON=<cron expression>`](#PAPERLESS_AI_LLM_INDEX_TASK_CRON) {#PAPERLESS_AI_LLM_INDEX_TASK_CRON}
|
||||||
|
|
||||||
: Configures the schedule to update the AI embeddings of text content and metadata for all documents. Only performed if
|
: Configures the schedule to update the AI embeddings of text content and metadata for all documents. Only performed if
|
||||||
|
|||||||
@@ -103,3 +103,30 @@ Multiple options are combined in a single value:
|
|||||||
```bash
|
```bash
|
||||||
PAPERLESS_DB_OPTIONS="sslmode=require;sslrootcert=/certs/ca.pem;pool.max_size=10"
|
PAPERLESS_DB_OPTIONS="sslmode=require;sslrootcert=/certs/ca.pem;pool.max_size=10"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## OpenID Connect Token Endpoint Authentication
|
||||||
|
|
||||||
|
Some existing OpenID Connect setups may require an explicit token endpoint authentication method after upgrading to v3.
|
||||||
|
|
||||||
|
#### Action Required
|
||||||
|
|
||||||
|
If OIDC login fails at the callback with an `invalid_client` error, add `token_auth_method` to the provider `settings` in
|
||||||
|
[`PAPERLESS_SOCIALACCOUNT_PROVIDERS`](configuration.md#PAPERLESS_SOCIALACCOUNT_PROVIDERS).
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"openid_connect": {
|
||||||
|
"APPS": [
|
||||||
|
{
|
||||||
|
...
|
||||||
|
"settings": {
|
||||||
|
"server_url": "https://login.example.com",
|
||||||
|
"token_auth_method": "client_secret_basic"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|||||||
@@ -140,24 +140,17 @@ a [superuser](usage.md#superusers) account.
|
|||||||
|
|
||||||
!!! warning
|
!!! warning
|
||||||
|
|
||||||
It is currently not possible to run the container rootless if additional languages are specified via `PAPERLESS_OCR_LANGUAGES`.
|
It is not possible to run the container rootless if additional languages are specified via `PAPERLESS_OCR_LANGUAGES`.
|
||||||
|
|
||||||
If you want to run Paperless as a rootless container, make this
|
If you want to run Paperless as a rootless container, set `user:` in `docker-compose.yml` to the UID and GID of your host user (use `id -u` and `id -g` to find these values). The container process starts directly as that user with no internal privilege remapping:
|
||||||
change in `docker-compose.yml`:
|
|
||||||
|
|
||||||
- Set the `user` running the container to map to the `paperless`
|
```yaml
|
||||||
user in the container. This value (`user_id` below) should be
|
webserver:
|
||||||
the same ID that `USERMAP_UID` and `USERMAP_GID` are set to in
|
image: ghcr.io/paperless-ngx/paperless-ngx:latest
|
||||||
`docker-compose.env`. See `USERMAP_UID` and `USERMAP_GID`
|
user: '1000:1000'
|
||||||
[here](configuration.md#docker).
|
```
|
||||||
|
|
||||||
Your entry for Paperless should contain something like:
|
Do not combine this with `USERMAP_UID` or `USERMAP_GID`, which are intended for the non-rootless case described in step 3.
|
||||||
|
|
||||||
> ```
|
|
||||||
> webserver:
|
|
||||||
> image: ghcr.io/paperless-ngx/paperless-ngx:latest
|
|
||||||
> user: <user_id>
|
|
||||||
> ```
|
|
||||||
|
|
||||||
**File systems without inotify support (e.g. NFS)**
|
**File systems without inotify support (e.g. NFS)**
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "paperless-ngx"
|
name = "paperless-ngx"
|
||||||
version = "2.20.10"
|
version = "2.20.13"
|
||||||
description = "A community-supported supercharged document management system: scan, index and archive all your physical documents"
|
description = "A community-supported supercharged document management system: scan, index and archive all your physical documents"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
@@ -26,7 +26,7 @@ dependencies = [
|
|||||||
# WARNING: django does not use semver.
|
# WARNING: django does not use semver.
|
||||||
# Only patch versions are guaranteed to not introduce breaking changes.
|
# Only patch versions are guaranteed to not introduce breaking changes.
|
||||||
"django~=5.2.10",
|
"django~=5.2.10",
|
||||||
"django-allauth[mfa,socialaccount]~=65.14.0",
|
"django-allauth[mfa,socialaccount]~=65.15.0",
|
||||||
"django-auditlog~=3.4.1",
|
"django-auditlog~=3.4.1",
|
||||||
"django-cachalot~=2.9.0",
|
"django-cachalot~=2.9.0",
|
||||||
"django-celery-results~=2.6.0",
|
"django-celery-results~=2.6.0",
|
||||||
@@ -60,7 +60,7 @@ dependencies = [
|
|||||||
"llama-index-llms-openai>=0.6.13",
|
"llama-index-llms-openai>=0.6.13",
|
||||||
"llama-index-vector-stores-faiss>=0.5.2",
|
"llama-index-vector-stores-faiss>=0.5.2",
|
||||||
"nltk~=3.9.1",
|
"nltk~=3.9.1",
|
||||||
"ocrmypdf~=16.13.0",
|
"ocrmypdf~=17.3.0",
|
||||||
"openai>=1.76",
|
"openai>=1.76",
|
||||||
"pathvalidate~=3.3.1",
|
"pathvalidate~=3.3.1",
|
||||||
"pdf2image~=1.17.0",
|
"pdf2image~=1.17.0",
|
||||||
@@ -248,15 +248,13 @@ lint.per-file-ignores."docker/wait-for-redis.py" = [
|
|||||||
lint.per-file-ignores."src/documents/models.py" = [
|
lint.per-file-ignores."src/documents/models.py" = [
|
||||||
"SIM115",
|
"SIM115",
|
||||||
]
|
]
|
||||||
lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [
|
|
||||||
"RUF001",
|
|
||||||
]
|
|
||||||
lint.isort.force-single-line = true
|
lint.isort.force-single-line = true
|
||||||
|
|
||||||
[tool.codespell]
|
[tool.codespell]
|
||||||
write-changes = true
|
write-changes = true
|
||||||
ignore-words-list = "criterias,afterall,valeu,ureue,equest,ure,assertIn,Oktober,commitish"
|
ignore-words-list = "criterias,afterall,valeu,ureue,equest,ure,assertIn,Oktober,commitish"
|
||||||
skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/documents/tests/samples/*,*.po,*.json"
|
skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/paperless/tests/samples/mail/*,src/documents/tests/samples/*,*.po,*.json"
|
||||||
|
|
||||||
[tool.pytest]
|
[tool.pytest]
|
||||||
minversion = "9.0"
|
minversion = "9.0"
|
||||||
@@ -271,10 +269,6 @@ testpaths = [
|
|||||||
"src/documents/tests/",
|
"src/documents/tests/",
|
||||||
"src/paperless/tests/",
|
"src/paperless/tests/",
|
||||||
"src/paperless_mail/tests/",
|
"src/paperless_mail/tests/",
|
||||||
"src/paperless_tesseract/tests/",
|
|
||||||
"src/paperless_tika/tests",
|
|
||||||
"src/paperless_text/tests/",
|
|
||||||
"src/paperless_remote/tests/",
|
|
||||||
"src/paperless_ai/tests",
|
"src/paperless_ai/tests",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -5,14 +5,14 @@
|
|||||||
<trans-unit id="ngb.alert.close" datatype="html">
|
<trans-unit id="ngb.alert.close" datatype="html">
|
||||||
<source>Close</source>
|
<source>Close</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/alert/alert.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/alert/alert.ts</context>
|
||||||
<context context-type="linenumber">50</context>
|
<context context-type="linenumber">50</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.carousel.slide-number" datatype="html">
|
<trans-unit id="ngb.carousel.slide-number" datatype="html">
|
||||||
<source> Slide <x id="INTERPOLATION" equiv-text="ueryList<NgbSli"/> of <x id="INTERPOLATION_1" equiv-text="EventSource = N"/> </source>
|
<source> Slide <x id="INTERPOLATION" equiv-text="ueryList<NgbSli"/> of <x id="INTERPOLATION_1" equiv-text="EventSource = N"/> </source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/carousel/carousel.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/carousel/carousel.ts</context>
|
||||||
<context context-type="linenumber">131,135</context>
|
<context context-type="linenumber">131,135</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
<note priority="1" from="description">Currently selected slide number read by screen reader</note>
|
<note priority="1" from="description">Currently selected slide number read by screen reader</note>
|
||||||
@@ -20,114 +20,114 @@
|
|||||||
<trans-unit id="ngb.carousel.previous" datatype="html">
|
<trans-unit id="ngb.carousel.previous" datatype="html">
|
||||||
<source>Previous</source>
|
<source>Previous</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/carousel/carousel.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/carousel/carousel.ts</context>
|
||||||
<context context-type="linenumber">159,162</context>
|
<context context-type="linenumber">159,162</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.carousel.next" datatype="html">
|
<trans-unit id="ngb.carousel.next" datatype="html">
|
||||||
<source>Next</source>
|
<source>Next</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/carousel/carousel.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/carousel/carousel.ts</context>
|
||||||
<context context-type="linenumber">202,203</context>
|
<context context-type="linenumber">202,203</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.datepicker.select-month" datatype="html">
|
<trans-unit id="ngb.datepicker.select-month" datatype="html">
|
||||||
<source>Select month</source>
|
<source>Select month</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/datepicker/datepicker-navigation-select.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/datepicker/datepicker-navigation-select.ts</context>
|
||||||
<context context-type="linenumber">91</context>
|
<context context-type="linenumber">91</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/datepicker/datepicker-navigation-select.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/datepicker/datepicker-navigation-select.ts</context>
|
||||||
<context context-type="linenumber">91</context>
|
<context context-type="linenumber">91</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.datepicker.select-year" datatype="html">
|
<trans-unit id="ngb.datepicker.select-year" datatype="html">
|
||||||
<source>Select year</source>
|
<source>Select year</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/datepicker/datepicker-navigation-select.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/datepicker/datepicker-navigation-select.ts</context>
|
||||||
<context context-type="linenumber">91</context>
|
<context context-type="linenumber">91</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/datepicker/datepicker-navigation-select.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/datepicker/datepicker-navigation-select.ts</context>
|
||||||
<context context-type="linenumber">91</context>
|
<context context-type="linenumber">91</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.datepicker.previous-month" datatype="html">
|
<trans-unit id="ngb.datepicker.previous-month" datatype="html">
|
||||||
<source>Previous month</source>
|
<source>Previous month</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/datepicker/datepicker-navigation.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/datepicker/datepicker-navigation.ts</context>
|
||||||
<context context-type="linenumber">83,85</context>
|
<context context-type="linenumber">83,85</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/datepicker/datepicker-navigation.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/datepicker/datepicker-navigation.ts</context>
|
||||||
<context context-type="linenumber">112</context>
|
<context context-type="linenumber">112</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.datepicker.next-month" datatype="html">
|
<trans-unit id="ngb.datepicker.next-month" datatype="html">
|
||||||
<source>Next month</source>
|
<source>Next month</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/datepicker/datepicker-navigation.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/datepicker/datepicker-navigation.ts</context>
|
||||||
<context context-type="linenumber">112</context>
|
<context context-type="linenumber">112</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/datepicker/datepicker-navigation.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/datepicker/datepicker-navigation.ts</context>
|
||||||
<context context-type="linenumber">112</context>
|
<context context-type="linenumber">112</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.pagination.first" datatype="html">
|
<trans-unit id="ngb.pagination.first" datatype="html">
|
||||||
<source>««</source>
|
<source>««</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/pagination/pagination-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/pagination/pagination-config.ts</context>
|
||||||
<context context-type="linenumber">20</context>
|
<context context-type="linenumber">20</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.pagination.previous" datatype="html">
|
<trans-unit id="ngb.pagination.previous" datatype="html">
|
||||||
<source>«</source>
|
<source>«</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/pagination/pagination-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/pagination/pagination-config.ts</context>
|
||||||
<context context-type="linenumber">20</context>
|
<context context-type="linenumber">20</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.pagination.next" datatype="html">
|
<trans-unit id="ngb.pagination.next" datatype="html">
|
||||||
<source>»</source>
|
<source>»</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/pagination/pagination-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/pagination/pagination-config.ts</context>
|
||||||
<context context-type="linenumber">20</context>
|
<context context-type="linenumber">20</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.pagination.last" datatype="html">
|
<trans-unit id="ngb.pagination.last" datatype="html">
|
||||||
<source>»»</source>
|
<source>»»</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/pagination/pagination-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/pagination/pagination-config.ts</context>
|
||||||
<context context-type="linenumber">20</context>
|
<context context-type="linenumber">20</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.pagination.first-aria" datatype="html">
|
<trans-unit id="ngb.pagination.first-aria" datatype="html">
|
||||||
<source>First</source>
|
<source>First</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/pagination/pagination-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/pagination/pagination-config.ts</context>
|
||||||
<context context-type="linenumber">20</context>
|
<context context-type="linenumber">20</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.pagination.previous-aria" datatype="html">
|
<trans-unit id="ngb.pagination.previous-aria" datatype="html">
|
||||||
<source>Previous</source>
|
<source>Previous</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/pagination/pagination-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/pagination/pagination-config.ts</context>
|
||||||
<context context-type="linenumber">20</context>
|
<context context-type="linenumber">20</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.pagination.next-aria" datatype="html">
|
<trans-unit id="ngb.pagination.next-aria" datatype="html">
|
||||||
<source>Next</source>
|
<source>Next</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/pagination/pagination-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/pagination/pagination-config.ts</context>
|
||||||
<context context-type="linenumber">20</context>
|
<context context-type="linenumber">20</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.pagination.last-aria" datatype="html">
|
<trans-unit id="ngb.pagination.last-aria" datatype="html">
|
||||||
<source>Last</source>
|
<source>Last</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/pagination/pagination-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/pagination/pagination-config.ts</context>
|
||||||
<context context-type="linenumber">20</context>
|
<context context-type="linenumber">20</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
@@ -135,105 +135,105 @@
|
|||||||
<source><x id="INTERPOLATION" equiv-text="barConfig);
|
<source><x id="INTERPOLATION" equiv-text="barConfig);
|
||||||
pu"/></source>
|
pu"/></source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/progressbar/progressbar.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/progressbar/progressbar.ts</context>
|
||||||
<context context-type="linenumber">41,42</context>
|
<context context-type="linenumber">41,42</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.timepicker.HH" datatype="html">
|
<trans-unit id="ngb.timepicker.HH" datatype="html">
|
||||||
<source>HH</source>
|
<source>HH</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/timepicker/timepicker-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/timepicker/timepicker-config.ts</context>
|
||||||
<context context-type="linenumber">21</context>
|
<context context-type="linenumber">21</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.timepicker.hours" datatype="html">
|
<trans-unit id="ngb.timepicker.hours" datatype="html">
|
||||||
<source>Hours</source>
|
<source>Hours</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/timepicker/timepicker-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/timepicker/timepicker-config.ts</context>
|
||||||
<context context-type="linenumber">21</context>
|
<context context-type="linenumber">21</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.timepicker.MM" datatype="html">
|
<trans-unit id="ngb.timepicker.MM" datatype="html">
|
||||||
<source>MM</source>
|
<source>MM</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/timepicker/timepicker-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/timepicker/timepicker-config.ts</context>
|
||||||
<context context-type="linenumber">21</context>
|
<context context-type="linenumber">21</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.timepicker.minutes" datatype="html">
|
<trans-unit id="ngb.timepicker.minutes" datatype="html">
|
||||||
<source>Minutes</source>
|
<source>Minutes</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/timepicker/timepicker-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/timepicker/timepicker-config.ts</context>
|
||||||
<context context-type="linenumber">21</context>
|
<context context-type="linenumber">21</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.timepicker.increment-hours" datatype="html">
|
<trans-unit id="ngb.timepicker.increment-hours" datatype="html">
|
||||||
<source>Increment hours</source>
|
<source>Increment hours</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/timepicker/timepicker-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/timepicker/timepicker-config.ts</context>
|
||||||
<context context-type="linenumber">21</context>
|
<context context-type="linenumber">21</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.timepicker.decrement-hours" datatype="html">
|
<trans-unit id="ngb.timepicker.decrement-hours" datatype="html">
|
||||||
<source>Decrement hours</source>
|
<source>Decrement hours</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/timepicker/timepicker-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/timepicker/timepicker-config.ts</context>
|
||||||
<context context-type="linenumber">21</context>
|
<context context-type="linenumber">21</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.timepicker.increment-minutes" datatype="html">
|
<trans-unit id="ngb.timepicker.increment-minutes" datatype="html">
|
||||||
<source>Increment minutes</source>
|
<source>Increment minutes</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/timepicker/timepicker-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/timepicker/timepicker-config.ts</context>
|
||||||
<context context-type="linenumber">21</context>
|
<context context-type="linenumber">21</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.timepicker.decrement-minutes" datatype="html">
|
<trans-unit id="ngb.timepicker.decrement-minutes" datatype="html">
|
||||||
<source>Decrement minutes</source>
|
<source>Decrement minutes</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/timepicker/timepicker-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/timepicker/timepicker-config.ts</context>
|
||||||
<context context-type="linenumber">21</context>
|
<context context-type="linenumber">21</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.timepicker.SS" datatype="html">
|
<trans-unit id="ngb.timepicker.SS" datatype="html">
|
||||||
<source>SS</source>
|
<source>SS</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/timepicker/timepicker-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/timepicker/timepicker-config.ts</context>
|
||||||
<context context-type="linenumber">21</context>
|
<context context-type="linenumber">21</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.timepicker.seconds" datatype="html">
|
<trans-unit id="ngb.timepicker.seconds" datatype="html">
|
||||||
<source>Seconds</source>
|
<source>Seconds</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/timepicker/timepicker-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/timepicker/timepicker-config.ts</context>
|
||||||
<context context-type="linenumber">21</context>
|
<context context-type="linenumber">21</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.timepicker.increment-seconds" datatype="html">
|
<trans-unit id="ngb.timepicker.increment-seconds" datatype="html">
|
||||||
<source>Increment seconds</source>
|
<source>Increment seconds</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/timepicker/timepicker-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/timepicker/timepicker-config.ts</context>
|
||||||
<context context-type="linenumber">21</context>
|
<context context-type="linenumber">21</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.timepicker.decrement-seconds" datatype="html">
|
<trans-unit id="ngb.timepicker.decrement-seconds" datatype="html">
|
||||||
<source>Decrement seconds</source>
|
<source>Decrement seconds</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/timepicker/timepicker-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/timepicker/timepicker-config.ts</context>
|
||||||
<context context-type="linenumber">21</context>
|
<context context-type="linenumber">21</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.timepicker.PM" datatype="html">
|
<trans-unit id="ngb.timepicker.PM" datatype="html">
|
||||||
<source><x id="INTERPOLATION"/></source>
|
<source><x id="INTERPOLATION"/></source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/timepicker/timepicker-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/timepicker/timepicker-config.ts</context>
|
||||||
<context context-type="linenumber">21</context>
|
<context context-type="linenumber">21</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="ngb.toast.close-aria" datatype="html">
|
<trans-unit id="ngb.toast.close-aria" datatype="html">
|
||||||
<source>Close</source>
|
<source>Close</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.0_@angular+core@21.2.0_@angular+_fdecb2f5429dfeda6301fd300107de5b/node_modules/src/toast/toast-config.ts</context>
|
<context context-type="sourcefile">node_modules/.pnpm/@ng-bootstrap+ng-bootstrap@20.0.0_@angular+common@21.2.4_@angular+core@21.2.4_@angular+_a674c967733fd102e5fef61ea5e6b837/node_modules/src/toast/toast-config.ts</context>
|
||||||
<context context-type="linenumber">54</context>
|
<context context-type="linenumber">54</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
@@ -532,15 +532,79 @@
|
|||||||
<context context-type="linenumber">125</context>
|
<context context-type="linenumber">125</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="3823219296477075982" datatype="html">
|
<trans-unit id="2159130950882492111" datatype="html">
|
||||||
<source>Discard</source>
|
<source>Cancel</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">src/app/components/admin/config/config.component.html</context>
|
<context context-type="sourcefile">src/app/components/admin/config/config.component.html</context>
|
||||||
<context context-type="linenumber">62</context>
|
<context context-type="linenumber">62</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">src/app/components/document-detail/document-detail.component.html</context>
|
<context context-type="sourcefile">src/app/components/admin/settings/settings.component.html</context>
|
||||||
<context context-type="linenumber">452</context>
|
<context context-type="linenumber">399</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/common/confirm-dialog/confirm-dialog.component.ts</context>
|
||||||
|
<context context-type="linenumber">47</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/common/edit-dialog/correspondent-edit-dialog/correspondent-edit-dialog.component.html</context>
|
||||||
|
<context context-type="linenumber">25</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/common/edit-dialog/custom-field-edit-dialog/custom-field-edit-dialog.component.html</context>
|
||||||
|
<context context-type="linenumber">51</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/common/edit-dialog/document-type-edit-dialog/document-type-edit-dialog.component.html</context>
|
||||||
|
<context context-type="linenumber">27</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/common/edit-dialog/group-edit-dialog/group-edit-dialog.component.html</context>
|
||||||
|
<context context-type="linenumber">19</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/common/edit-dialog/mail-account-edit-dialog/mail-account-edit-dialog.component.html</context>
|
||||||
|
<context context-type="linenumber">39</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html</context>
|
||||||
|
<context context-type="linenumber">80</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.html</context>
|
||||||
|
<context context-type="linenumber">76</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/common/edit-dialog/tag-edit-dialog/tag-edit-dialog.component.html</context>
|
||||||
|
<context context-type="linenumber">30</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/common/edit-dialog/user-edit-dialog/user-edit-dialog.component.html</context>
|
||||||
|
<context context-type="linenumber">56</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/common/edit-dialog/workflow-edit-dialog/workflow-edit-dialog.component.html</context>
|
||||||
|
<context context-type="linenumber">115</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/common/permissions-dialog/permissions-dialog.component.html</context>
|
||||||
|
<context context-type="linenumber">31</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/common/profile-edit-dialog/profile-edit-dialog.component.html</context>
|
||||||
|
<context context-type="linenumber">182</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/document-list/bulk-editor/custom-fields-bulk-edit-dialog/custom-fields-bulk-edit-dialog.component.html</context>
|
||||||
|
<context context-type="linenumber">81</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/document-list/save-view-config-dialog/save-view-config-dialog.component.html</context>
|
||||||
|
<context context-type="linenumber">21</context>
|
||||||
|
</context-group>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/manage/saved-views/saved-views.component.html</context>
|
||||||
|
<context context-type="linenumber">82</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="3768927257183755959" datatype="html">
|
<trans-unit id="3768927257183755959" datatype="html">
|
||||||
@@ -1514,77 +1578,6 @@
|
|||||||
<context context-type="linenumber">389</context>
|
<context context-type="linenumber">389</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="2159130950882492111" datatype="html">
|
|
||||||
<source>Cancel</source>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/admin/settings/settings.component.html</context>
|
|
||||||
<context context-type="linenumber">399</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/common/confirm-dialog/confirm-dialog.component.ts</context>
|
|
||||||
<context context-type="linenumber">47</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/common/edit-dialog/correspondent-edit-dialog/correspondent-edit-dialog.component.html</context>
|
|
||||||
<context context-type="linenumber">25</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/common/edit-dialog/custom-field-edit-dialog/custom-field-edit-dialog.component.html</context>
|
|
||||||
<context context-type="linenumber">51</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/common/edit-dialog/document-type-edit-dialog/document-type-edit-dialog.component.html</context>
|
|
||||||
<context context-type="linenumber">27</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/common/edit-dialog/group-edit-dialog/group-edit-dialog.component.html</context>
|
|
||||||
<context context-type="linenumber">19</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/common/edit-dialog/mail-account-edit-dialog/mail-account-edit-dialog.component.html</context>
|
|
||||||
<context context-type="linenumber">39</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html</context>
|
|
||||||
<context context-type="linenumber">80</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.html</context>
|
|
||||||
<context context-type="linenumber">76</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/common/edit-dialog/tag-edit-dialog/tag-edit-dialog.component.html</context>
|
|
||||||
<context context-type="linenumber">30</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/common/edit-dialog/user-edit-dialog/user-edit-dialog.component.html</context>
|
|
||||||
<context context-type="linenumber">56</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/common/edit-dialog/workflow-edit-dialog/workflow-edit-dialog.component.html</context>
|
|
||||||
<context context-type="linenumber">115</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/common/permissions-dialog/permissions-dialog.component.html</context>
|
|
||||||
<context context-type="linenumber">31</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/common/profile-edit-dialog/profile-edit-dialog.component.html</context>
|
|
||||||
<context context-type="linenumber">182</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/document-list/bulk-editor/custom-fields-bulk-edit-dialog/custom-fields-bulk-edit-dialog.component.html</context>
|
|
||||||
<context context-type="linenumber">81</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/document-list/save-view-config-dialog/save-view-config-dialog.component.html</context>
|
|
||||||
<context context-type="linenumber">21</context>
|
|
||||||
</context-group>
|
|
||||||
<context-group purpose="location">
|
|
||||||
<context context-type="sourcefile">src/app/components/manage/saved-views/saved-views.component.html</context>
|
|
||||||
<context context-type="linenumber">82</context>
|
|
||||||
</context-group>
|
|
||||||
</trans-unit>
|
|
||||||
<trans-unit id="6839066544204061364" datatype="html">
|
<trans-unit id="6839066544204061364" datatype="html">
|
||||||
<source>Use system language</source>
|
<source>Use system language</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
@@ -5736,7 +5729,7 @@
|
|||||||
<source>Open <x id="PH" equiv-text="this.title"/> filter</source>
|
<source>Open <x id="PH" equiv-text="this.title"/> filter</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">src/app/components/common/filterable-dropdown/filterable-dropdown.component.ts</context>
|
<context context-type="sourcefile">src/app/components/common/filterable-dropdown/filterable-dropdown.component.ts</context>
|
||||||
<context context-type="linenumber">788</context>
|
<context context-type="linenumber">823</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="7005745151564974365" datatype="html">
|
<trans-unit id="7005745151564974365" datatype="html">
|
||||||
@@ -7489,7 +7482,7 @@
|
|||||||
</context-group>
|
</context-group>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">src/main.ts</context>
|
<context context-type="sourcefile">src/main.ts</context>
|
||||||
<context context-type="linenumber">411</context>
|
<context context-type="linenumber">416</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="5028777105388019087" datatype="html">
|
<trans-unit id="5028777105388019087" datatype="html">
|
||||||
@@ -7684,6 +7677,13 @@
|
|||||||
<context context-type="linenumber">450</context>
|
<context context-type="linenumber">450</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
|
<trans-unit id="3823219296477075982" datatype="html">
|
||||||
|
<source>Discard</source>
|
||||||
|
<context-group purpose="location">
|
||||||
|
<context context-type="sourcefile">src/app/components/document-detail/document-detail.component.html</context>
|
||||||
|
<context context-type="linenumber">452</context>
|
||||||
|
</context-group>
|
||||||
|
</trans-unit>
|
||||||
<trans-unit id="1309556917227148591" datatype="html">
|
<trans-unit id="1309556917227148591" datatype="html">
|
||||||
<source>Document loading...</source>
|
<source>Document loading...</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
@@ -11352,14 +11352,14 @@
|
|||||||
<source>Prev</source>
|
<source>Prev</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">src/main.ts</context>
|
<context context-type="sourcefile">src/main.ts</context>
|
||||||
<context context-type="linenumber">410</context>
|
<context context-type="linenumber">415</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
<trans-unit id="1241348629231510663" datatype="html">
|
<trans-unit id="1241348629231510663" datatype="html">
|
||||||
<source>End</source>
|
<source>End</source>
|
||||||
<context-group purpose="location">
|
<context-group purpose="location">
|
||||||
<context context-type="sourcefile">src/main.ts</context>
|
<context context-type="sourcefile">src/main.ts</context>
|
||||||
<context context-type="linenumber">412</context>
|
<context context-type="linenumber">417</context>
|
||||||
</context-group>
|
</context-group>
|
||||||
</trans-unit>
|
</trans-unit>
|
||||||
</body>
|
</body>
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "paperless-ngx-ui",
|
"name": "paperless-ngx-ui",
|
||||||
"version": "2.20.10",
|
"version": "2.20.13",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"preinstall": "npx only-allow pnpm",
|
"preinstall": "npx only-allow pnpm",
|
||||||
"ng": "ng",
|
"ng": "ng",
|
||||||
@@ -11,17 +11,17 @@
|
|||||||
},
|
},
|
||||||
"private": true,
|
"private": true,
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@angular/cdk": "^21.2.0",
|
"@angular/cdk": "^21.2.2",
|
||||||
"@angular/common": "~21.2.0",
|
"@angular/common": "~21.2.4",
|
||||||
"@angular/compiler": "~21.2.0",
|
"@angular/compiler": "~21.2.4",
|
||||||
"@angular/core": "~21.2.0",
|
"@angular/core": "~21.2.4",
|
||||||
"@angular/forms": "~21.2.0",
|
"@angular/forms": "~21.2.4",
|
||||||
"@angular/localize": "~21.2.0",
|
"@angular/localize": "~21.2.4",
|
||||||
"@angular/platform-browser": "~21.2.0",
|
"@angular/platform-browser": "~21.2.4",
|
||||||
"@angular/platform-browser-dynamic": "~21.2.0",
|
"@angular/platform-browser-dynamic": "~21.2.4",
|
||||||
"@angular/router": "~21.2.0",
|
"@angular/router": "~21.2.4",
|
||||||
"@ng-bootstrap/ng-bootstrap": "^20.0.0",
|
"@ng-bootstrap/ng-bootstrap": "^20.0.0",
|
||||||
"@ng-select/ng-select": "^21.4.1",
|
"@ng-select/ng-select": "^21.5.2",
|
||||||
"@ngneat/dirty-check-forms": "^3.0.3",
|
"@ngneat/dirty-check-forms": "^3.0.3",
|
||||||
"@popperjs/core": "^2.11.8",
|
"@popperjs/core": "^2.11.8",
|
||||||
"bootstrap": "^5.3.8",
|
"bootstrap": "^5.3.8",
|
||||||
@@ -42,26 +42,26 @@
|
|||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@angular-builders/custom-webpack": "^21.0.3",
|
"@angular-builders/custom-webpack": "^21.0.3",
|
||||||
"@angular-builders/jest": "^21.0.3",
|
"@angular-builders/jest": "^21.0.3",
|
||||||
"@angular-devkit/core": "^21.2.0",
|
"@angular-devkit/core": "^21.2.2",
|
||||||
"@angular-devkit/schematics": "^21.2.0",
|
"@angular-devkit/schematics": "^21.2.2",
|
||||||
"@angular-eslint/builder": "21.3.0",
|
"@angular-eslint/builder": "21.3.0",
|
||||||
"@angular-eslint/eslint-plugin": "21.3.0",
|
"@angular-eslint/eslint-plugin": "21.3.0",
|
||||||
"@angular-eslint/eslint-plugin-template": "21.3.0",
|
"@angular-eslint/eslint-plugin-template": "21.3.0",
|
||||||
"@angular-eslint/schematics": "21.3.0",
|
"@angular-eslint/schematics": "21.3.0",
|
||||||
"@angular-eslint/template-parser": "21.3.0",
|
"@angular-eslint/template-parser": "21.3.0",
|
||||||
"@angular/build": "^21.2.0",
|
"@angular/build": "^21.2.2",
|
||||||
"@angular/cli": "~21.2.0",
|
"@angular/cli": "~21.2.2",
|
||||||
"@angular/compiler-cli": "~21.2.0",
|
"@angular/compiler-cli": "~21.2.4",
|
||||||
"@codecov/webpack-plugin": "^1.9.1",
|
"@codecov/webpack-plugin": "^1.9.1",
|
||||||
"@playwright/test": "^1.58.2",
|
"@playwright/test": "^1.58.2",
|
||||||
"@types/jest": "^30.0.0",
|
"@types/jest": "^30.0.0",
|
||||||
"@types/node": "^25.3.3",
|
"@types/node": "^25.4.0",
|
||||||
"@typescript-eslint/eslint-plugin": "^8.54.0",
|
"@typescript-eslint/eslint-plugin": "^8.57.0",
|
||||||
"@typescript-eslint/parser": "^8.54.0",
|
"@typescript-eslint/parser": "^8.57.0",
|
||||||
"@typescript-eslint/utils": "^8.54.0",
|
"@typescript-eslint/utils": "^8.57.0",
|
||||||
"eslint": "^10.0.2",
|
"eslint": "^10.0.3",
|
||||||
"jest": "30.2.0",
|
"jest": "30.3.0",
|
||||||
"jest-environment-jsdom": "^30.2.0",
|
"jest-environment-jsdom": "^30.3.0",
|
||||||
"jest-junit": "^16.0.0",
|
"jest-junit": "^16.0.0",
|
||||||
"jest-preset-angular": "^16.1.1",
|
"jest-preset-angular": "^16.1.1",
|
||||||
"jest-websocket-mock": "^2.5.0",
|
"jest-websocket-mock": "^2.5.0",
|
||||||
|
|||||||
1858
src-ui/pnpm-lock.yaml
generated
@@ -59,7 +59,7 @@
|
|||||||
<div [ngbNavOutlet]="nav" class="border-start border-end border-bottom p-3 mb-3 shadow-sm"></div>
|
<div [ngbNavOutlet]="nav" class="border-start border-end border-bottom p-3 mb-3 shadow-sm"></div>
|
||||||
<div class="btn-toolbar" role="toolbar">
|
<div class="btn-toolbar" role="toolbar">
|
||||||
<div class="btn-group me-2">
|
<div class="btn-group me-2">
|
||||||
<button type="button" (click)="discardChanges()" class="btn btn-outline-secondary" [disabled]="loading || (isDirty$ | async) === false" i18n>Discard</button>
|
<button type="button" (click)="discardChanges()" class="btn btn-outline-secondary" [disabled]="loading || (isDirty$ | async) === false" i18n>Cancel</button>
|
||||||
</div>
|
</div>
|
||||||
<div class="btn-group">
|
<div class="btn-group">
|
||||||
<button type="submit" class="btn btn-primary" [disabled]="loading || !configForm.valid || (isDirty$ | async) === false" i18n>Save</button>
|
<button type="submit" class="btn btn-primary" [disabled]="loading || !configForm.valid || (isDirty$ | async) === false" i18n>Save</button>
|
||||||
|
|||||||
@@ -631,6 +631,59 @@ describe('FilterableDropdownComponent & FilterableDropdownSelectionModel', () =>
|
|||||||
])
|
])
|
||||||
})
|
})
|
||||||
|
|
||||||
|
it('deselecting a parent clears selected descendants', () => {
|
||||||
|
const root: Tag = { id: 100, name: 'Root Tag' }
|
||||||
|
const child: Tag = { id: 101, name: 'Child Tag', parent: root.id }
|
||||||
|
const grandchild: Tag = {
|
||||||
|
id: 102,
|
||||||
|
name: 'Grandchild Tag',
|
||||||
|
parent: child.id,
|
||||||
|
}
|
||||||
|
const other: Tag = { id: 103, name: 'Other Tag' }
|
||||||
|
|
||||||
|
selectionModel.items = [root, child, grandchild, other]
|
||||||
|
selectionModel.set(root.id, ToggleableItemState.Selected, false)
|
||||||
|
selectionModel.set(child.id, ToggleableItemState.Selected, false)
|
||||||
|
selectionModel.set(grandchild.id, ToggleableItemState.Selected, false)
|
||||||
|
selectionModel.set(other.id, ToggleableItemState.Selected, false)
|
||||||
|
|
||||||
|
selectionModel.toggle(root.id, false)
|
||||||
|
|
||||||
|
expect(selectionModel.getSelectedItems()).toEqual([other])
|
||||||
|
})
|
||||||
|
|
||||||
|
it('un-excluding a parent clears excluded descendants', () => {
|
||||||
|
const root: Tag = { id: 110, name: 'Root Tag' }
|
||||||
|
const child: Tag = { id: 111, name: 'Child Tag', parent: root.id }
|
||||||
|
const other: Tag = { id: 112, name: 'Other Tag' }
|
||||||
|
|
||||||
|
selectionModel.items = [root, child, other]
|
||||||
|
selectionModel.set(root.id, ToggleableItemState.Excluded, false)
|
||||||
|
selectionModel.set(child.id, ToggleableItemState.Excluded, false)
|
||||||
|
selectionModel.set(other.id, ToggleableItemState.Excluded, false)
|
||||||
|
|
||||||
|
selectionModel.exclude(root.id, false)
|
||||||
|
|
||||||
|
expect(selectionModel.getExcludedItems()).toEqual([other])
|
||||||
|
})
|
||||||
|
|
||||||
|
it('excluding a selected parent clears selected descendants', () => {
|
||||||
|
const root: Tag = { id: 120, name: 'Root Tag' }
|
||||||
|
const child: Tag = { id: 121, name: 'Child Tag', parent: root.id }
|
||||||
|
const other: Tag = { id: 122, name: 'Other Tag' }
|
||||||
|
|
||||||
|
selectionModel.manyToOne = true
|
||||||
|
selectionModel.items = [root, child, other]
|
||||||
|
selectionModel.set(root.id, ToggleableItemState.Selected, false)
|
||||||
|
selectionModel.set(child.id, ToggleableItemState.Selected, false)
|
||||||
|
selectionModel.set(other.id, ToggleableItemState.Selected, false)
|
||||||
|
|
||||||
|
selectionModel.exclude(root.id, false)
|
||||||
|
|
||||||
|
expect(selectionModel.getExcludedItems()).toEqual([root])
|
||||||
|
expect(selectionModel.getSelectedItems()).toEqual([other])
|
||||||
|
})
|
||||||
|
|
||||||
it('resorts items immediately when document count sorting enabled', () => {
|
it('resorts items immediately when document count sorting enabled', () => {
|
||||||
const apple: Tag = { id: 55, name: 'Apple' }
|
const apple: Tag = { id: 55, name: 'Apple' }
|
||||||
const zebra: Tag = { id: 56, name: 'Zebra' }
|
const zebra: Tag = { id: 56, name: 'Zebra' }
|
||||||
|
|||||||
@@ -235,6 +235,7 @@ export class FilterableDropdownSelectionModel {
|
|||||||
state == ToggleableItemState.Excluded
|
state == ToggleableItemState.Excluded
|
||||||
) {
|
) {
|
||||||
this.temporarySelectionStates.delete(id)
|
this.temporarySelectionStates.delete(id)
|
||||||
|
this.clearDescendantSelections(id)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!id) {
|
if (!id) {
|
||||||
@@ -261,6 +262,7 @@ export class FilterableDropdownSelectionModel {
|
|||||||
|
|
||||||
if (this.manyToOne || this.singleSelect) {
|
if (this.manyToOne || this.singleSelect) {
|
||||||
this.temporarySelectionStates.set(id, ToggleableItemState.Excluded)
|
this.temporarySelectionStates.set(id, ToggleableItemState.Excluded)
|
||||||
|
this.clearDescendantSelections(id)
|
||||||
|
|
||||||
if (this.singleSelect) {
|
if (this.singleSelect) {
|
||||||
for (let key of this.temporarySelectionStates.keys()) {
|
for (let key of this.temporarySelectionStates.keys()) {
|
||||||
@@ -281,9 +283,15 @@ export class FilterableDropdownSelectionModel {
|
|||||||
newState = ToggleableItemState.NotSelected
|
newState = ToggleableItemState.NotSelected
|
||||||
}
|
}
|
||||||
this.temporarySelectionStates.set(id, newState)
|
this.temporarySelectionStates.set(id, newState)
|
||||||
|
if (newState == ToggleableItemState.Excluded) {
|
||||||
|
this.clearDescendantSelections(id)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else if (!id || state == ToggleableItemState.Excluded) {
|
} else if (!id || state == ToggleableItemState.Excluded) {
|
||||||
this.temporarySelectionStates.delete(id)
|
this.temporarySelectionStates.delete(id)
|
||||||
|
if (id) {
|
||||||
|
this.clearDescendantSelections(id)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fireEvent) {
|
if (fireEvent) {
|
||||||
@@ -295,6 +303,33 @@ export class FilterableDropdownSelectionModel {
|
|||||||
return this.selectionStates.get(id) || ToggleableItemState.NotSelected
|
return this.selectionStates.get(id) || ToggleableItemState.NotSelected
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private clearDescendantSelections(id: number) {
|
||||||
|
for (const descendantID of this.getDescendantIDs(id)) {
|
||||||
|
this.temporarySelectionStates.delete(descendantID)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private getDescendantIDs(id: number): number[] {
|
||||||
|
const descendants: number[] = []
|
||||||
|
const queue: number[] = [id]
|
||||||
|
|
||||||
|
while (queue.length) {
|
||||||
|
const parentID = queue.shift()
|
||||||
|
for (const item of this._items) {
|
||||||
|
if (
|
||||||
|
typeof item?.id === 'number' &&
|
||||||
|
typeof (item as any)['parent'] === 'number' &&
|
||||||
|
(item as any)['parent'] === parentID
|
||||||
|
) {
|
||||||
|
descendants.push(item.id)
|
||||||
|
queue.push(item.id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return descendants
|
||||||
|
}
|
||||||
|
|
||||||
get logicalOperator(): LogicalOperator {
|
get logicalOperator(): LogicalOperator {
|
||||||
return this.temporaryLogicalOperator
|
return this.temporaryLogicalOperator
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
@if (document && displayFields?.includes(DisplayField.TAGS)) {
|
@if (document && displayFields?.includes(DisplayField.TAGS)) {
|
||||||
<div class="tags d-flex flex-column text-end position-absolute me-1 fs-6">
|
<div class="tags d-flex flex-column text-end position-absolute me-1 fs-6" [class.tags-no-wrap]="document.tags.length > 3">
|
||||||
@for (tagID of tagIDs; track tagID) {
|
@for (tagID of tagIDs; track tagID) {
|
||||||
<pngx-tag [tagID]="tagID" (click)="clickTag.emit(tagID);$event.stopPropagation()" [clickable]="true" linkTitle="Toggle tag filter" i18n-linkTitle></pngx-tag>
|
<pngx-tag [tagID]="tagID" (click)="clickTag.emit(tagID);$event.stopPropagation()" [clickable]="true" linkTitle="Toggle tag filter" i18n-linkTitle></pngx-tag>
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -72,4 +72,14 @@ a {
|
|||||||
max-width: 80%;
|
max-width: 80%;
|
||||||
row-gap: .2rem;
|
row-gap: .2rem;
|
||||||
line-height: 1;
|
line-height: 1;
|
||||||
|
|
||||||
|
&.tags-no-wrap {
|
||||||
|
::ng-deep .badge {
|
||||||
|
display: inline-block;
|
||||||
|
max-width: 100%;
|
||||||
|
white-space: nowrap;
|
||||||
|
overflow: hidden;
|
||||||
|
text-overflow: ellipsis;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -82,6 +82,16 @@ describe('DocumentCardSmallComponent', () => {
|
|||||||
).toHaveLength(6)
|
).toHaveLength(6)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
it('should clear hidden tag counter when tag count falls below the limit', () => {
|
||||||
|
expect(component.moreTags).toEqual(3)
|
||||||
|
|
||||||
|
component.document.tags = [1, 2, 3, 4, 5, 6]
|
||||||
|
fixture.detectChanges()
|
||||||
|
|
||||||
|
expect(component.moreTags).toBeNull()
|
||||||
|
expect(fixture.nativeElement.textContent).not.toContain('+ 3')
|
||||||
|
})
|
||||||
|
|
||||||
it('should try to close the preview on mouse leave', () => {
|
it('should try to close the preview on mouse leave', () => {
|
||||||
component.popupPreview = {
|
component.popupPreview = {
|
||||||
close: jest.fn(),
|
close: jest.fn(),
|
||||||
|
|||||||
@@ -126,6 +126,7 @@ export class DocumentCardSmallComponent
|
|||||||
this.moreTags = this.document.tags.length - (limit - 1)
|
this.moreTags = this.document.tags.length - (limit - 1)
|
||||||
return this.document.tags.slice(0, limit - 1)
|
return this.document.tags.slice(0, limit - 1)
|
||||||
} else {
|
} else {
|
||||||
|
this.moreTags = null
|
||||||
return this.document.tags
|
return this.document.tags
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
122
src-ui/src/app/interceptors/auth-expiry.interceptor.spec.ts
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
import {
|
||||||
|
HttpErrorResponse,
|
||||||
|
HttpHandlerFn,
|
||||||
|
HttpRequest,
|
||||||
|
} from '@angular/common/http'
|
||||||
|
import { throwError } from 'rxjs'
|
||||||
|
import * as navUtils from '../utils/navigation'
|
||||||
|
import { createAuthExpiryInterceptor } from './auth-expiry.interceptor'
|
||||||
|
|
||||||
|
describe('withAuthExpiryInterceptor', () => {
|
||||||
|
let interceptor: ReturnType<typeof createAuthExpiryInterceptor>
|
||||||
|
let dateNowSpy: jest.SpiedFunction<typeof Date.now>
|
||||||
|
|
||||||
|
beforeEach(() => {
|
||||||
|
interceptor = createAuthExpiryInterceptor()
|
||||||
|
dateNowSpy = jest.spyOn(Date, 'now').mockReturnValue(1000)
|
||||||
|
})
|
||||||
|
|
||||||
|
afterEach(() => {
|
||||||
|
jest.restoreAllMocks()
|
||||||
|
})
|
||||||
|
|
||||||
|
it('reloads when an API request returns 401', () => {
|
||||||
|
const reloadSpy = jest
|
||||||
|
.spyOn(navUtils, 'locationReload')
|
||||||
|
.mockImplementation(() => {})
|
||||||
|
|
||||||
|
interceptor(
|
||||||
|
new HttpRequest('GET', '/api/documents/'),
|
||||||
|
failingHandler('/api/documents/', 401)
|
||||||
|
).subscribe({
|
||||||
|
error: () => undefined,
|
||||||
|
})
|
||||||
|
|
||||||
|
expect(reloadSpy).toHaveBeenCalledTimes(1)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('does not reload for non-401 errors', () => {
|
||||||
|
const reloadSpy = jest
|
||||||
|
.spyOn(navUtils, 'locationReload')
|
||||||
|
.mockImplementation(() => {})
|
||||||
|
|
||||||
|
interceptor(
|
||||||
|
new HttpRequest('GET', '/api/documents/'),
|
||||||
|
failingHandler('/api/documents/', 500)
|
||||||
|
).subscribe({
|
||||||
|
error: () => undefined,
|
||||||
|
})
|
||||||
|
|
||||||
|
expect(reloadSpy).not.toHaveBeenCalled()
|
||||||
|
})
|
||||||
|
|
||||||
|
it('does not reload for non-api 401 responses', () => {
|
||||||
|
const reloadSpy = jest
|
||||||
|
.spyOn(navUtils, 'locationReload')
|
||||||
|
.mockImplementation(() => {})
|
||||||
|
|
||||||
|
interceptor(
|
||||||
|
new HttpRequest('GET', '/accounts/profile/'),
|
||||||
|
failingHandler('/accounts/profile/', 401)
|
||||||
|
).subscribe({
|
||||||
|
error: () => undefined,
|
||||||
|
})
|
||||||
|
|
||||||
|
expect(reloadSpy).not.toHaveBeenCalled()
|
||||||
|
})
|
||||||
|
|
||||||
|
it('reloads only once even with multiple API 401 responses', () => {
|
||||||
|
const reloadSpy = jest
|
||||||
|
.spyOn(navUtils, 'locationReload')
|
||||||
|
.mockImplementation(() => {})
|
||||||
|
|
||||||
|
const request = new HttpRequest('GET', '/api/documents/')
|
||||||
|
const handler = failingHandler('/api/documents/', 401)
|
||||||
|
|
||||||
|
interceptor(request, handler).subscribe({
|
||||||
|
error: () => undefined,
|
||||||
|
})
|
||||||
|
interceptor(request, handler).subscribe({
|
||||||
|
error: () => undefined,
|
||||||
|
})
|
||||||
|
|
||||||
|
expect(reloadSpy).toHaveBeenCalledTimes(1)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('retries reload after cooldown for repeated API 401 responses', () => {
|
||||||
|
const reloadSpy = jest
|
||||||
|
.spyOn(navUtils, 'locationReload')
|
||||||
|
.mockImplementation(() => {})
|
||||||
|
|
||||||
|
dateNowSpy
|
||||||
|
.mockReturnValueOnce(1000)
|
||||||
|
.mockReturnValueOnce(2500)
|
||||||
|
.mockReturnValueOnce(3501)
|
||||||
|
|
||||||
|
const request = new HttpRequest('GET', '/api/documents/')
|
||||||
|
const handler = failingHandler('/api/documents/', 401)
|
||||||
|
|
||||||
|
interceptor(request, handler).subscribe({
|
||||||
|
error: () => undefined,
|
||||||
|
})
|
||||||
|
interceptor(request, handler).subscribe({
|
||||||
|
error: () => undefined,
|
||||||
|
})
|
||||||
|
interceptor(request, handler).subscribe({
|
||||||
|
error: () => undefined,
|
||||||
|
})
|
||||||
|
|
||||||
|
expect(reloadSpy).toHaveBeenCalledTimes(2)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
function failingHandler(url: string, status: number): HttpHandlerFn {
|
||||||
|
return (_request) =>
|
||||||
|
throwError(
|
||||||
|
() =>
|
||||||
|
new HttpErrorResponse({
|
||||||
|
status,
|
||||||
|
url,
|
||||||
|
})
|
||||||
|
)
|
||||||
|
}
|
||||||
37
src-ui/src/app/interceptors/auth-expiry.interceptor.ts
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
import {
|
||||||
|
HttpErrorResponse,
|
||||||
|
HttpEvent,
|
||||||
|
HttpHandlerFn,
|
||||||
|
HttpInterceptorFn,
|
||||||
|
HttpRequest,
|
||||||
|
} from '@angular/common/http'
|
||||||
|
import { catchError, Observable, throwError } from 'rxjs'
|
||||||
|
import { locationReload } from '../utils/navigation'
|
||||||
|
|
||||||
|
export const createAuthExpiryInterceptor = (): HttpInterceptorFn => {
|
||||||
|
let lastReloadAttempt = Number.NEGATIVE_INFINITY
|
||||||
|
|
||||||
|
return (
|
||||||
|
request: HttpRequest<unknown>,
|
||||||
|
next: HttpHandlerFn
|
||||||
|
): Observable<HttpEvent<unknown>> =>
|
||||||
|
next(request).pipe(
|
||||||
|
catchError((error: unknown) => {
|
||||||
|
if (
|
||||||
|
error instanceof HttpErrorResponse &&
|
||||||
|
error.status === 401 &&
|
||||||
|
request.url.includes('/api/')
|
||||||
|
) {
|
||||||
|
const now = Date.now()
|
||||||
|
if (now - lastReloadAttempt >= 2000) {
|
||||||
|
lastReloadAttempt = now
|
||||||
|
locationReload()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return throwError(() => error)
|
||||||
|
})
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
export const withAuthExpiryInterceptor = createAuthExpiryInterceptor()
|
||||||
@@ -6,7 +6,7 @@ export const environment = {
|
|||||||
apiVersion: '10', // match src/paperless/settings.py
|
apiVersion: '10', // match src/paperless/settings.py
|
||||||
appTitle: 'Paperless-ngx',
|
appTitle: 'Paperless-ngx',
|
||||||
tag: 'prod',
|
tag: 'prod',
|
||||||
version: '2.20.10',
|
version: '2.20.13',
|
||||||
webSocketHost: window.location.host,
|
webSocketHost: window.location.host,
|
||||||
webSocketProtocol: window.location.protocol == 'https:' ? 'wss:' : 'ws:',
|
webSocketProtocol: window.location.protocol == 'https:' ? 'wss:' : 'ws:',
|
||||||
webSocketBaseUrl: base_url.pathname + 'ws/',
|
webSocketBaseUrl: base_url.pathname + 'ws/',
|
||||||
|
|||||||
@@ -154,6 +154,7 @@ import { DirtyDocGuard } from './app/guards/dirty-doc.guard'
|
|||||||
import { DirtySavedViewGuard } from './app/guards/dirty-saved-view.guard'
|
import { DirtySavedViewGuard } from './app/guards/dirty-saved-view.guard'
|
||||||
import { PermissionsGuard } from './app/guards/permissions.guard'
|
import { PermissionsGuard } from './app/guards/permissions.guard'
|
||||||
import { withApiVersionInterceptor } from './app/interceptors/api-version.interceptor'
|
import { withApiVersionInterceptor } from './app/interceptors/api-version.interceptor'
|
||||||
|
import { withAuthExpiryInterceptor } from './app/interceptors/auth-expiry.interceptor'
|
||||||
import { withCsrfInterceptor } from './app/interceptors/csrf.interceptor'
|
import { withCsrfInterceptor } from './app/interceptors/csrf.interceptor'
|
||||||
import { DocumentTitlePipe } from './app/pipes/document-title.pipe'
|
import { DocumentTitlePipe } from './app/pipes/document-title.pipe'
|
||||||
import { FilterPipe } from './app/pipes/filter.pipe'
|
import { FilterPipe } from './app/pipes/filter.pipe'
|
||||||
@@ -399,7 +400,11 @@ bootstrapApplication(AppComponent, {
|
|||||||
StoragePathNamePipe,
|
StoragePathNamePipe,
|
||||||
provideHttpClient(
|
provideHttpClient(
|
||||||
withInterceptorsFromDi(),
|
withInterceptorsFromDi(),
|
||||||
withInterceptors([withCsrfInterceptor, withApiVersionInterceptor]),
|
withInterceptors([
|
||||||
|
withCsrfInterceptor,
|
||||||
|
withApiVersionInterceptor,
|
||||||
|
withAuthExpiryInterceptor,
|
||||||
|
]),
|
||||||
withFetch()
|
withFetch()
|
||||||
),
|
),
|
||||||
provideUiTour({
|
provideUiTour({
|
||||||
|
|||||||
@@ -150,6 +150,15 @@ $form-check-radio-checked-bg-image-dark: url("data:image/svg+xml,<svg xmlns='htt
|
|||||||
background-color: var(--pngx-body-color-accent);
|
background-color: var(--pngx-body-color-accent);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.list-group-item-action:not(.active):active {
|
||||||
|
--bs-list-group-action-active-color: var(--bs-body-color);
|
||||||
|
--bs-list-group-action-active-bg: var(--pngx-bg-darker);
|
||||||
|
}
|
||||||
|
|
||||||
|
.form-control:hover::file-selector-button {
|
||||||
|
background-color:var(--pngx-bg-dark) !important
|
||||||
|
}
|
||||||
|
|
||||||
.search-container {
|
.search-container {
|
||||||
input, input:focus, i-bs[name="search"] , ::placeholder {
|
input, input:focus, i-bs[name="search"] , ::placeholder {
|
||||||
color: var(--pngx-primary-text-contrast) !important;
|
color: var(--pngx-primary-text-contrast) !important;
|
||||||
|
|||||||
@@ -3,24 +3,19 @@ from django.core.checks import Error
|
|||||||
from django.core.checks import Warning
|
from django.core.checks import Warning
|
||||||
from django.core.checks import register
|
from django.core.checks import register
|
||||||
|
|
||||||
from documents.signals import document_consumer_declaration
|
|
||||||
from documents.templating.utils import convert_format_str_to_template_format
|
from documents.templating.utils import convert_format_str_to_template_format
|
||||||
|
from paperless.parsers.registry import get_parser_registry
|
||||||
|
|
||||||
|
|
||||||
@register()
|
@register()
|
||||||
def parser_check(app_configs, **kwargs):
|
def parser_check(app_configs, **kwargs):
|
||||||
parsers = []
|
if not get_parser_registry().all_parsers():
|
||||||
for response in document_consumer_declaration.send(None):
|
|
||||||
parsers.append(response[1])
|
|
||||||
|
|
||||||
if len(parsers) == 0:
|
|
||||||
return [
|
return [
|
||||||
Error(
|
Error(
|
||||||
"No parsers found. This is a bug. The consumer won't be "
|
"No parsers found. This is a bug. The consumer won't be "
|
||||||
"able to consume any documents without parsers.",
|
"able to consume any documents without parsers.",
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
else:
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from pathlib import Path
|
|||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
from collections.abc import Callable
|
||||||
from collections.abc import Iterator
|
from collections.abc import Iterator
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
@@ -191,7 +192,12 @@ class DocumentClassifier:
|
|||||||
|
|
||||||
target_file_temp.rename(target_file)
|
target_file_temp.rename(target_file)
|
||||||
|
|
||||||
def train(self) -> bool:
|
def train(
|
||||||
|
self,
|
||||||
|
status_callback: Callable[[str], None] | None = None,
|
||||||
|
) -> bool:
|
||||||
|
notify = status_callback if status_callback is not None else lambda _: None
|
||||||
|
|
||||||
# Get non-inbox documents
|
# Get non-inbox documents
|
||||||
docs_queryset = (
|
docs_queryset = (
|
||||||
Document.objects.exclude(
|
Document.objects.exclude(
|
||||||
@@ -213,6 +219,7 @@ class DocumentClassifier:
|
|||||||
|
|
||||||
# Step 1: Extract and preprocess training data from the database.
|
# Step 1: Extract and preprocess training data from the database.
|
||||||
logger.debug("Gathering data from database...")
|
logger.debug("Gathering data from database...")
|
||||||
|
notify(f"Gathering data from {docs_queryset.count()} document(s)...")
|
||||||
hasher = sha256()
|
hasher = sha256()
|
||||||
for doc in docs_queryset:
|
for doc in docs_queryset:
|
||||||
y = -1
|
y = -1
|
||||||
@@ -290,6 +297,7 @@ class DocumentClassifier:
|
|||||||
|
|
||||||
# Step 2: vectorize data
|
# Step 2: vectorize data
|
||||||
logger.debug("Vectorizing data...")
|
logger.debug("Vectorizing data...")
|
||||||
|
notify("Vectorizing document content...")
|
||||||
|
|
||||||
def content_generator() -> Iterator[str]:
|
def content_generator() -> Iterator[str]:
|
||||||
"""
|
"""
|
||||||
@@ -316,6 +324,7 @@ class DocumentClassifier:
|
|||||||
# Step 3: train the classifiers
|
# Step 3: train the classifiers
|
||||||
if num_tags > 0:
|
if num_tags > 0:
|
||||||
logger.debug("Training tags classifier...")
|
logger.debug("Training tags classifier...")
|
||||||
|
notify(f"Training tags classifier ({num_tags} tag(s))...")
|
||||||
|
|
||||||
if num_tags == 1:
|
if num_tags == 1:
|
||||||
# Special case where only one tag has auto:
|
# Special case where only one tag has auto:
|
||||||
@@ -339,6 +348,9 @@ class DocumentClassifier:
|
|||||||
|
|
||||||
if num_correspondents > 0:
|
if num_correspondents > 0:
|
||||||
logger.debug("Training correspondent classifier...")
|
logger.debug("Training correspondent classifier...")
|
||||||
|
notify(
|
||||||
|
f"Training correspondent classifier ({num_correspondents} correspondent(s))...",
|
||||||
|
)
|
||||||
self.correspondent_classifier = MLPClassifier(tol=0.01)
|
self.correspondent_classifier = MLPClassifier(tol=0.01)
|
||||||
self.correspondent_classifier.fit(data_vectorized, labels_correspondent)
|
self.correspondent_classifier.fit(data_vectorized, labels_correspondent)
|
||||||
else:
|
else:
|
||||||
@@ -349,6 +361,9 @@ class DocumentClassifier:
|
|||||||
|
|
||||||
if num_document_types > 0:
|
if num_document_types > 0:
|
||||||
logger.debug("Training document type classifier...")
|
logger.debug("Training document type classifier...")
|
||||||
|
notify(
|
||||||
|
f"Training document type classifier ({num_document_types} type(s))...",
|
||||||
|
)
|
||||||
self.document_type_classifier = MLPClassifier(tol=0.01)
|
self.document_type_classifier = MLPClassifier(tol=0.01)
|
||||||
self.document_type_classifier.fit(data_vectorized, labels_document_type)
|
self.document_type_classifier.fit(data_vectorized, labels_document_type)
|
||||||
else:
|
else:
|
||||||
@@ -361,6 +376,7 @@ class DocumentClassifier:
|
|||||||
logger.debug(
|
logger.debug(
|
||||||
"Training storage paths classifier...",
|
"Training storage paths classifier...",
|
||||||
)
|
)
|
||||||
|
notify(f"Training storage path classifier ({num_storage_paths} path(s))...")
|
||||||
self.storage_path_classifier = MLPClassifier(tol=0.01)
|
self.storage_path_classifier = MLPClassifier(tol=0.01)
|
||||||
self.storage_path_classifier.fit(
|
self.storage_path_classifier.fit(
|
||||||
data_vectorized,
|
data_vectorized,
|
||||||
|
|||||||
@@ -32,9 +32,7 @@ from documents.models import DocumentType
|
|||||||
from documents.models import StoragePath
|
from documents.models import StoragePath
|
||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
from documents.models import WorkflowTrigger
|
from documents.models import WorkflowTrigger
|
||||||
from documents.parsers import DocumentParser
|
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
|
||||||
from documents.permissions import set_permissions_for_object
|
from documents.permissions import set_permissions_for_object
|
||||||
from documents.plugins.base import AlwaysRunPluginMixin
|
from documents.plugins.base import AlwaysRunPluginMixin
|
||||||
from documents.plugins.base import ConsumeTaskPlugin
|
from documents.plugins.base import ConsumeTaskPlugin
|
||||||
@@ -51,28 +49,13 @@ from documents.templating.workflows import parse_w_workflow_placeholders
|
|||||||
from documents.utils import copy_basic_file_stats
|
from documents.utils import copy_basic_file_stats
|
||||||
from documents.utils import copy_file_with_basic_stats
|
from documents.utils import copy_file_with_basic_stats
|
||||||
from documents.utils import run_subprocess
|
from documents.utils import run_subprocess
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers import ParserContext
|
||||||
from paperless_mail.parsers import MailDocumentParser
|
from paperless.parsers import ParserProtocol
|
||||||
|
from paperless.parsers.registry import get_parser_registry
|
||||||
|
|
||||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||||
|
|
||||||
|
|
||||||
def _parser_cleanup(parser: DocumentParser) -> None:
|
|
||||||
"""
|
|
||||||
Call cleanup on a parser, handling the new-style context-manager parsers.
|
|
||||||
|
|
||||||
New-style parsers (e.g. TextDocumentParser) use __exit__ for teardown
|
|
||||||
instead of a cleanup() method. This shim will be removed once all existing parsers
|
|
||||||
have switched to the new style and this consumer is updated to use it
|
|
||||||
|
|
||||||
TODO(stumpylog): Remove me in the future
|
|
||||||
"""
|
|
||||||
if isinstance(parser, TextDocumentParser):
|
|
||||||
parser.__exit__(None, None, None)
|
|
||||||
else:
|
|
||||||
parser.cleanup()
|
|
||||||
|
|
||||||
|
|
||||||
class WorkflowTriggerPlugin(
|
class WorkflowTriggerPlugin(
|
||||||
NoCleanupPluginMixin,
|
NoCleanupPluginMixin,
|
||||||
NoSetupPluginMixin,
|
NoSetupPluginMixin,
|
||||||
@@ -409,8 +392,12 @@ class ConsumerPlugin(
|
|||||||
self.log.error(f"Error attempting to clean PDF: {e}")
|
self.log.error(f"Error attempting to clean PDF: {e}")
|
||||||
|
|
||||||
# Based on the mime type, get the parser for that type
|
# Based on the mime type, get the parser for that type
|
||||||
parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
|
parser_class: type[ParserProtocol] | None = (
|
||||||
|
get_parser_registry().get_parser_for_file(
|
||||||
mime_type,
|
mime_type,
|
||||||
|
self.filename,
|
||||||
|
self.working_copy,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
if not parser_class:
|
if not parser_class:
|
||||||
tempdir.cleanup()
|
tempdir.cleanup()
|
||||||
@@ -433,22 +420,13 @@ class ConsumerPlugin(
|
|||||||
tempdir.cleanup()
|
tempdir.cleanup()
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def progress_callback(
|
|
||||||
current_progress,
|
|
||||||
max_progress,
|
|
||||||
) -> None: # pragma: no cover
|
|
||||||
# recalculate progress to be within 20 and 80
|
|
||||||
p = int((current_progress / max_progress) * 50 + 20)
|
|
||||||
self._send_progress(p, 100, ProgressStatusOptions.WORKING)
|
|
||||||
|
|
||||||
# This doesn't parse the document yet, but gives us a parser.
|
# This doesn't parse the document yet, but gives us a parser.
|
||||||
|
with parser_class() as document_parser:
|
||||||
document_parser: DocumentParser = parser_class(
|
document_parser.configure(
|
||||||
self.logging_group,
|
ParserContext(mailrule_id=self.input_doc.mailrule_id),
|
||||||
progress_callback=progress_callback,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
self.log.debug(f"Parser: {type(document_parser).__name__}")
|
self.log.debug(f"Parser: {document_parser.name} v{document_parser.version}")
|
||||||
|
|
||||||
# Parse the document. This may take some time.
|
# Parse the document. This may take some time.
|
||||||
|
|
||||||
@@ -466,21 +444,8 @@ class ConsumerPlugin(
|
|||||||
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
||||||
)
|
)
|
||||||
self.log.debug(f"Parsing {self.filename}...")
|
self.log.debug(f"Parsing {self.filename}...")
|
||||||
if (
|
|
||||||
isinstance(document_parser, MailDocumentParser)
|
|
||||||
and self.input_doc.mailrule_id
|
|
||||||
):
|
|
||||||
document_parser.parse(
|
|
||||||
self.working_copy,
|
|
||||||
mime_type,
|
|
||||||
self.filename,
|
|
||||||
self.input_doc.mailrule_id,
|
|
||||||
)
|
|
||||||
elif isinstance(document_parser, TextDocumentParser):
|
|
||||||
# TODO(stumpylog): Remove me in the future
|
|
||||||
document_parser.parse(self.working_copy, mime_type)
|
document_parser.parse(self.working_copy, mime_type)
|
||||||
else:
|
|
||||||
document_parser.parse(self.working_copy, mime_type, self.filename)
|
|
||||||
|
|
||||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||||
self._send_progress(
|
self._send_progress(
|
||||||
@@ -489,15 +454,7 @@ class ConsumerPlugin(
|
|||||||
ProgressStatusOptions.WORKING,
|
ProgressStatusOptions.WORKING,
|
||||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||||
)
|
)
|
||||||
if isinstance(document_parser, TextDocumentParser):
|
|
||||||
# TODO(stumpylog): Remove me in the future
|
|
||||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||||
else:
|
|
||||||
thumbnail = document_parser.get_thumbnail(
|
|
||||||
self.working_copy,
|
|
||||||
mime_type,
|
|
||||||
self.filename,
|
|
||||||
)
|
|
||||||
|
|
||||||
text = document_parser.get_text()
|
text = document_parser.get_text()
|
||||||
date = document_parser.get_date()
|
date = document_parser.get_date()
|
||||||
@@ -511,10 +468,12 @@ class ConsumerPlugin(
|
|||||||
with get_date_parser() as date_parser:
|
with get_date_parser() as date_parser:
|
||||||
date = next(date_parser.parse(self.filename, text), None)
|
date = next(date_parser.parse(self.filename, text), None)
|
||||||
archive_path = document_parser.get_archive_path()
|
archive_path = document_parser.get_archive_path()
|
||||||
page_count = document_parser.get_page_count(self.working_copy, mime_type)
|
page_count = document_parser.get_page_count(
|
||||||
|
self.working_copy,
|
||||||
|
mime_type,
|
||||||
|
)
|
||||||
|
|
||||||
except ParseError as e:
|
except ParseError as e:
|
||||||
_parser_cleanup(document_parser)
|
|
||||||
if tempdir:
|
if tempdir:
|
||||||
tempdir.cleanup()
|
tempdir.cleanup()
|
||||||
self._fail(
|
self._fail(
|
||||||
@@ -524,7 +483,6 @@ class ConsumerPlugin(
|
|||||||
exception=e,
|
exception=e,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
_parser_cleanup(document_parser)
|
|
||||||
if tempdir:
|
if tempdir:
|
||||||
tempdir.cleanup()
|
tempdir.cleanup()
|
||||||
self._fail(
|
self._fail(
|
||||||
@@ -573,7 +531,9 @@ class ConsumerPlugin(
|
|||||||
settings.AUDIT_LOG_ENABLED
|
settings.AUDIT_LOG_ENABLED
|
||||||
and self.metadata.actor_id is not None
|
and self.metadata.actor_id is not None
|
||||||
):
|
):
|
||||||
actor = User.objects.filter(pk=self.metadata.actor_id).first()
|
actor = User.objects.filter(
|
||||||
|
pk=self.metadata.actor_id,
|
||||||
|
).first()
|
||||||
if actor is not None:
|
if actor is not None:
|
||||||
from auditlog.context import ( # type: ignore[import-untyped]
|
from auditlog.context import ( # type: ignore[import-untyped]
|
||||||
set_actor,
|
set_actor,
|
||||||
@@ -697,7 +657,9 @@ class ConsumerPlugin(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Delete the file only if it was successfully consumed
|
# Delete the file only if it was successfully consumed
|
||||||
self.log.debug(f"Deleting original file {self.input_doc.original_file}")
|
self.log.debug(
|
||||||
|
f"Deleting original file {self.input_doc.original_file}",
|
||||||
|
)
|
||||||
self.input_doc.original_file.unlink()
|
self.input_doc.original_file.unlink()
|
||||||
self.log.debug(f"Deleting working copy {self.working_copy}")
|
self.log.debug(f"Deleting working copy {self.working_copy}")
|
||||||
self.working_copy.unlink()
|
self.working_copy.unlink()
|
||||||
@@ -726,7 +688,6 @@ class ConsumerPlugin(
|
|||||||
exception=e,
|
exception=e,
|
||||||
)
|
)
|
||||||
finally:
|
finally:
|
||||||
_parser_cleanup(document_parser)
|
|
||||||
tempdir.cleanup()
|
tempdir.cleanup()
|
||||||
|
|
||||||
self.run_post_consume_script(document)
|
self.run_post_consume_script(document)
|
||||||
|
|||||||
@@ -477,6 +477,13 @@ class DelayedFullTextQuery(DelayedQuery):
|
|||||||
try:
|
try:
|
||||||
corrected = self.searcher.correct_query(q, q_str)
|
corrected = self.searcher.correct_query(q, q_str)
|
||||||
if corrected.string != q_str:
|
if corrected.string != q_str:
|
||||||
|
corrected_results = self.searcher.search(
|
||||||
|
corrected.query,
|
||||||
|
limit=1,
|
||||||
|
filter=MappedDocIdSet(self.filter_queryset, self.searcher.ixreader),
|
||||||
|
scored=False,
|
||||||
|
)
|
||||||
|
if len(corrected_results) > 0:
|
||||||
suggested_correction = corrected.string
|
suggested_correction = corrected.string
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|||||||
@@ -1,13 +1,32 @@
|
|||||||
from django.core.management.base import BaseCommand
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
from documents.management.commands.base import PaperlessCommand
|
||||||
from documents.tasks import train_classifier
|
from documents.tasks import train_classifier
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(PaperlessCommand):
|
||||||
help = (
|
help = (
|
||||||
"Trains the classifier on your data and saves the resulting models to a "
|
"Trains the classifier on your data and saves the resulting models to a "
|
||||||
"file. The document consumer will then automatically use this new model."
|
"file. The document consumer will then automatically use this new model."
|
||||||
)
|
)
|
||||||
|
supports_progress_bar = False
|
||||||
|
supports_multiprocessing = False
|
||||||
|
|
||||||
def handle(self, *args, **options):
|
def handle(self, *args, **options) -> None:
|
||||||
train_classifier(scheduled=False)
|
start = time.monotonic()
|
||||||
|
|
||||||
|
with (
|
||||||
|
self.buffered_logging("paperless.tasks"),
|
||||||
|
self.buffered_logging("paperless.classifier"),
|
||||||
|
):
|
||||||
|
train_classifier(
|
||||||
|
scheduled=False,
|
||||||
|
status_callback=lambda msg: self.console.print(f" {msg}"),
|
||||||
|
)
|
||||||
|
|
||||||
|
elapsed = time.monotonic() - start
|
||||||
|
self.console.print(
|
||||||
|
f"[green]✓[/green] Classifier training complete ({elapsed:.1f}s)",
|
||||||
|
)
|
||||||
|
|||||||
@@ -205,7 +205,7 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
ContentType.objects.all().delete()
|
ContentType.objects.all().delete()
|
||||||
Permission.objects.all().delete()
|
Permission.objects.all().delete()
|
||||||
for manifest_path in self.manifest_paths:
|
for manifest_path in self.manifest_paths:
|
||||||
call_command("loaddata", manifest_path)
|
call_command("loaddata", manifest_path, skip_checks=True)
|
||||||
except (FieldDoesNotExist, DeserializationError, IntegrityError) as e:
|
except (FieldDoesNotExist, DeserializationError, IntegrityError) as e:
|
||||||
self.stdout.write(self.style.ERROR("Database import failed"))
|
self.stdout.write(self.style.ERROR("Database import failed"))
|
||||||
if (
|
if (
|
||||||
|
|||||||
@@ -3,14 +3,18 @@ import shutil
|
|||||||
|
|
||||||
from documents.management.commands.base import PaperlessCommand
|
from documents.management.commands.base import PaperlessCommand
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
from paperless.parsers.registry import get_parser_registry
|
||||||
|
|
||||||
logger = logging.getLogger("paperless.management.thumbnails")
|
logger = logging.getLogger("paperless.management.thumbnails")
|
||||||
|
|
||||||
|
|
||||||
def _process_document(doc_id: int) -> None:
|
def _process_document(doc_id: int) -> None:
|
||||||
document: Document = Document.objects.get(id=doc_id)
|
document: Document = Document.objects.get(id=doc_id)
|
||||||
parser_class = get_parser_class_for_mime_type(document.mime_type)
|
parser_class = get_parser_registry().get_parser_for_file(
|
||||||
|
document.mime_type,
|
||||||
|
document.original_filename or "",
|
||||||
|
document.source_path,
|
||||||
|
)
|
||||||
|
|
||||||
if parser_class is None:
|
if parser_class is None:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
@@ -20,18 +24,9 @@ def _process_document(doc_id: int) -> None:
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
parser = parser_class(logging_group=None)
|
with parser_class() as parser:
|
||||||
|
thumb = parser.get_thumbnail(document.source_path, document.mime_type)
|
||||||
try:
|
|
||||||
thumb = parser.get_thumbnail(
|
|
||||||
document.source_path,
|
|
||||||
document.mime_type,
|
|
||||||
document.get_public_filename(),
|
|
||||||
)
|
|
||||||
shutil.move(thumb, document.thumbnail_path)
|
shutil.move(thumb, document.thumbnail_path)
|
||||||
finally:
|
|
||||||
# TODO(stumpylog): Cleanup once all parsers are handled
|
|
||||||
parser.cleanup()
|
|
||||||
|
|
||||||
|
|
||||||
class Command(PaperlessCommand):
|
class Command(PaperlessCommand):
|
||||||
|
|||||||
@@ -3,84 +3,47 @@ from __future__ import annotations
|
|||||||
import logging
|
import logging
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
from functools import lru_cache
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
from documents.loggers import LoggingMixin
|
from documents.loggers import LoggingMixin
|
||||||
from documents.signals import document_consumer_declaration
|
|
||||||
from documents.utils import copy_file_with_basic_stats
|
from documents.utils import copy_file_with_basic_stats
|
||||||
from documents.utils import run_subprocess
|
from documents.utils import run_subprocess
|
||||||
|
from paperless.parsers.registry import get_parser_registry
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
# This regular expression will try to find dates in the document at
|
|
||||||
# hand and will match the following formats:
|
|
||||||
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
|
||||||
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
|
||||||
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
|
||||||
# - ZZZZ.XX.YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
|
||||||
# - ZZZZ/XX/YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
|
||||||
# - ZZZZ-XX-YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
|
||||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
|
||||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
|
||||||
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
|
||||||
# - XX MON ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits. MONTH is 3 letters
|
|
||||||
# - XXPP MONTH ZZZZ with XX being 1 or 2 and PP being 2 letters and ZZZZ being 4 digits
|
|
||||||
|
|
||||||
# TODO: isn't there a date parsing library for this?
|
|
||||||
|
|
||||||
DATE_REGEX = re.compile(
|
|
||||||
r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
|
|
||||||
r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
|
|
||||||
r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
|
|
||||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
|
|
||||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
|
|
||||||
r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
|
|
||||||
r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger("paperless.parsing")
|
logger = logging.getLogger("paperless.parsing")
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=8)
|
|
||||||
def is_mime_type_supported(mime_type: str) -> bool:
|
def is_mime_type_supported(mime_type: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Returns True if the mime type is supported, False otherwise
|
Returns True if the mime type is supported, False otherwise
|
||||||
"""
|
"""
|
||||||
return get_parser_class_for_mime_type(mime_type) is not None
|
return get_parser_registry().get_parser_for_file(mime_type, "") is not None
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=8)
|
|
||||||
def get_default_file_extension(mime_type: str) -> str:
|
def get_default_file_extension(mime_type: str) -> str:
|
||||||
"""
|
"""
|
||||||
Returns the default file extension for a mimetype, or
|
Returns the default file extension for a mimetype, or
|
||||||
an empty string if it could not be determined
|
an empty string if it could not be determined
|
||||||
"""
|
"""
|
||||||
for response in document_consumer_declaration.send(None):
|
parser_class = get_parser_registry().get_parser_for_file(mime_type, "")
|
||||||
parser_declaration = response[1]
|
if parser_class is not None:
|
||||||
supported_mime_types = parser_declaration["mime_types"]
|
supported = parser_class.supported_mime_types()
|
||||||
|
if mime_type in supported:
|
||||||
if mime_type in supported_mime_types:
|
return supported[mime_type]
|
||||||
return supported_mime_types[mime_type]
|
|
||||||
|
|
||||||
ext = mimetypes.guess_extension(mime_type)
|
ext = mimetypes.guess_extension(mime_type)
|
||||||
if ext:
|
return ext if ext else ""
|
||||||
return ext
|
|
||||||
else:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=8)
|
|
||||||
def is_file_ext_supported(ext: str) -> bool:
|
def is_file_ext_supported(ext: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Returns True if the file extension is supported, False otherwise
|
Returns True if the file extension is supported, False otherwise
|
||||||
@@ -94,44 +57,17 @@ def is_file_ext_supported(ext: str) -> bool:
|
|||||||
|
|
||||||
def get_supported_file_extensions() -> set[str]:
|
def get_supported_file_extensions() -> set[str]:
|
||||||
extensions = set()
|
extensions = set()
|
||||||
for response in document_consumer_declaration.send(None):
|
for parser_class in get_parser_registry().all_parsers():
|
||||||
parser_declaration = response[1]
|
for mime_type, ext in parser_class.supported_mime_types().items():
|
||||||
supported_mime_types = parser_declaration["mime_types"]
|
|
||||||
|
|
||||||
for mime_type in supported_mime_types:
|
|
||||||
extensions.update(mimetypes.guess_all_extensions(mime_type))
|
extensions.update(mimetypes.guess_all_extensions(mime_type))
|
||||||
# Python's stdlib might be behind, so also add what the parser
|
# Python's stdlib might be behind, so also add what the parser
|
||||||
# says is the default extension
|
# says is the default extension
|
||||||
# This makes image/webp supported on Python < 3.11
|
# This makes image/webp supported on Python < 3.11
|
||||||
extensions.add(supported_mime_types[mime_type])
|
extensions.add(ext)
|
||||||
|
|
||||||
return extensions
|
return extensions
|
||||||
|
|
||||||
|
|
||||||
def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None:
|
|
||||||
"""
|
|
||||||
Returns the best parser (by weight) for the given mimetype or
|
|
||||||
None if no parser exists
|
|
||||||
"""
|
|
||||||
|
|
||||||
options = []
|
|
||||||
|
|
||||||
for response in document_consumer_declaration.send(None):
|
|
||||||
parser_declaration = response[1]
|
|
||||||
supported_mime_types = parser_declaration["mime_types"]
|
|
||||||
|
|
||||||
if mime_type in supported_mime_types:
|
|
||||||
options.append(parser_declaration)
|
|
||||||
|
|
||||||
if not options:
|
|
||||||
return None
|
|
||||||
|
|
||||||
best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0]
|
|
||||||
|
|
||||||
# Return the parser with the highest weight.
|
|
||||||
return best_parser["parser"]
|
|
||||||
|
|
||||||
|
|
||||||
def run_convert(
|
def run_convert(
|
||||||
input_file,
|
input_file,
|
||||||
output_file,
|
output_file,
|
||||||
|
|||||||
@@ -797,6 +797,25 @@ class ReadWriteSerializerMethodField(serializers.SerializerMethodField):
|
|||||||
return {self.field_name: data}
|
return {self.field_name: data}
|
||||||
|
|
||||||
|
|
||||||
|
def validate_documentlink_targets(user, doc_ids):
|
||||||
|
if Document.objects.filter(id__in=doc_ids).count() != len(doc_ids):
|
||||||
|
raise serializers.ValidationError(
|
||||||
|
"Some documents in value don't exist or were specified twice.",
|
||||||
|
)
|
||||||
|
|
||||||
|
if user is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
target_documents = Document.objects.filter(id__in=doc_ids).select_related("owner")
|
||||||
|
if not all(
|
||||||
|
has_perms_owner_aware(user, "change_document", document)
|
||||||
|
for document in target_documents
|
||||||
|
):
|
||||||
|
raise PermissionDenied(
|
||||||
|
_("Insufficient permissions."),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class CustomFieldInstanceSerializer(serializers.ModelSerializer):
|
class CustomFieldInstanceSerializer(serializers.ModelSerializer):
|
||||||
field = serializers.PrimaryKeyRelatedField(queryset=CustomField.objects.all())
|
field = serializers.PrimaryKeyRelatedField(queryset=CustomField.objects.all())
|
||||||
value = ReadWriteSerializerMethodField(allow_null=True)
|
value = ReadWriteSerializerMethodField(allow_null=True)
|
||||||
@@ -887,11 +906,10 @@ class CustomFieldInstanceSerializer(serializers.ModelSerializer):
|
|||||||
"Value must be a list",
|
"Value must be a list",
|
||||||
)
|
)
|
||||||
doc_ids = data["value"]
|
doc_ids = data["value"]
|
||||||
if Document.objects.filter(id__in=doc_ids).count() != len(
|
request = self.context.get("request")
|
||||||
data["value"],
|
validate_documentlink_targets(
|
||||||
):
|
getattr(request, "user", None) if request is not None else None,
|
||||||
raise serializers.ValidationError(
|
doc_ids,
|
||||||
"Some documents in value don't exist or were specified twice.",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return data
|
return data
|
||||||
@@ -1713,6 +1731,19 @@ class BulkEditSerializer(
|
|||||||
f"Some custom fields in {name} don't exist or were specified twice.",
|
f"Some custom fields in {name} don't exist or were specified twice.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if isinstance(custom_fields, dict):
|
||||||
|
custom_field_map = CustomField.objects.in_bulk(ids)
|
||||||
|
for raw_field_id, value in custom_fields.items():
|
||||||
|
field = custom_field_map.get(int(raw_field_id))
|
||||||
|
if (
|
||||||
|
field is not None
|
||||||
|
and field.data_type == CustomField.FieldDataType.DOCUMENTLINK
|
||||||
|
and value is not None
|
||||||
|
):
|
||||||
|
if not isinstance(value, list):
|
||||||
|
raise serializers.ValidationError("Value must be a list")
|
||||||
|
validate_documentlink_targets(self.user, value)
|
||||||
|
|
||||||
def validate_method(self, method):
|
def validate_method(self, method):
|
||||||
if method == "set_correspondent":
|
if method == "set_correspondent":
|
||||||
return bulk_edit.set_correspondent
|
return bulk_edit.set_correspondent
|
||||||
|
|||||||
@@ -2,5 +2,4 @@ from django.dispatch import Signal
|
|||||||
|
|
||||||
document_consumption_started = Signal()
|
document_consumption_started = Signal()
|
||||||
document_consumption_finished = Signal()
|
document_consumption_finished = Signal()
|
||||||
document_consumer_declaration = Signal()
|
|
||||||
document_updated = Signal()
|
document_updated = Signal()
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -403,6 +404,14 @@ class CannotMoveFilesException(Exception):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _path_matches_checksum(path: Path, checksum: str | None) -> bool:
|
||||||
|
if checksum is None or not path.is_file():
|
||||||
|
return False
|
||||||
|
|
||||||
|
with path.open("rb") as f:
|
||||||
|
return hashlib.md5(f.read()).hexdigest() == checksum
|
||||||
|
|
||||||
|
|
||||||
def _filename_template_uses_custom_fields(doc: Document) -> bool:
|
def _filename_template_uses_custom_fields(doc: Document) -> bool:
|
||||||
template = None
|
template = None
|
||||||
if doc.storage_path is not None:
|
if doc.storage_path is not None:
|
||||||
@@ -473,10 +482,12 @@ def update_filename_and_move_files(
|
|||||||
old_filename = instance.filename
|
old_filename = instance.filename
|
||||||
old_source_path = instance.source_path
|
old_source_path = instance.source_path
|
||||||
move_original = False
|
move_original = False
|
||||||
|
original_already_moved = False
|
||||||
|
|
||||||
old_archive_filename = instance.archive_filename
|
old_archive_filename = instance.archive_filename
|
||||||
old_archive_path = instance.archive_path
|
old_archive_path = instance.archive_path
|
||||||
move_archive = False
|
move_archive = False
|
||||||
|
archive_already_moved = False
|
||||||
|
|
||||||
candidate_filename = generate_filename(instance)
|
candidate_filename = generate_filename(instance)
|
||||||
if len(str(candidate_filename)) > Document.MAX_STORED_FILENAME_LENGTH:
|
if len(str(candidate_filename)) > Document.MAX_STORED_FILENAME_LENGTH:
|
||||||
@@ -497,6 +508,13 @@ def update_filename_and_move_files(
|
|||||||
candidate_source_path.exists()
|
candidate_source_path.exists()
|
||||||
and candidate_source_path != old_source_path
|
and candidate_source_path != old_source_path
|
||||||
):
|
):
|
||||||
|
if not old_source_path.is_file() and _path_matches_checksum(
|
||||||
|
candidate_source_path,
|
||||||
|
instance.checksum,
|
||||||
|
):
|
||||||
|
new_filename = candidate_filename
|
||||||
|
original_already_moved = True
|
||||||
|
else:
|
||||||
# Only fall back to unique search when there is an actual conflict
|
# Only fall back to unique search when there is an actual conflict
|
||||||
new_filename = generate_unique_filename(instance)
|
new_filename = generate_unique_filename(instance)
|
||||||
else:
|
else:
|
||||||
@@ -504,7 +522,9 @@ def update_filename_and_move_files(
|
|||||||
|
|
||||||
# Need to convert to string to be able to save it to the db
|
# Need to convert to string to be able to save it to the db
|
||||||
instance.filename = str(new_filename)
|
instance.filename = str(new_filename)
|
||||||
move_original = old_filename != instance.filename
|
move_original = (
|
||||||
|
old_filename != instance.filename and not original_already_moved
|
||||||
|
)
|
||||||
|
|
||||||
if instance.has_archive_version:
|
if instance.has_archive_version:
|
||||||
archive_candidate = generate_filename(instance, archive_filename=True)
|
archive_candidate = generate_filename(instance, archive_filename=True)
|
||||||
@@ -525,6 +545,13 @@ def update_filename_and_move_files(
|
|||||||
archive_candidate_path.exists()
|
archive_candidate_path.exists()
|
||||||
and archive_candidate_path != old_archive_path
|
and archive_candidate_path != old_archive_path
|
||||||
):
|
):
|
||||||
|
if not old_archive_path.is_file() and _path_matches_checksum(
|
||||||
|
archive_candidate_path,
|
||||||
|
instance.archive_checksum,
|
||||||
|
):
|
||||||
|
new_archive_filename = archive_candidate
|
||||||
|
archive_already_moved = True
|
||||||
|
else:
|
||||||
new_archive_filename = generate_unique_filename(
|
new_archive_filename = generate_unique_filename(
|
||||||
instance,
|
instance,
|
||||||
archive_filename=True,
|
archive_filename=True,
|
||||||
@@ -534,15 +561,22 @@ def update_filename_and_move_files(
|
|||||||
|
|
||||||
instance.archive_filename = str(new_archive_filename)
|
instance.archive_filename = str(new_archive_filename)
|
||||||
|
|
||||||
move_archive = old_archive_filename != instance.archive_filename
|
move_archive = (
|
||||||
|
old_archive_filename != instance.archive_filename
|
||||||
|
and not archive_already_moved
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
move_archive = False
|
move_archive = False
|
||||||
|
|
||||||
if not move_original and not move_archive:
|
if not move_original and not move_archive:
|
||||||
# Just update modified. Also, don't save() here to prevent infinite recursion.
|
updates = {"modified": timezone.now()}
|
||||||
Document.objects.filter(pk=instance.pk).update(
|
if old_filename != instance.filename:
|
||||||
modified=timezone.now(),
|
updates["filename"] = instance.filename
|
||||||
)
|
if old_archive_filename != instance.archive_filename:
|
||||||
|
updates["archive_filename"] = instance.archive_filename
|
||||||
|
|
||||||
|
# Don't save() here to prevent infinite recursion.
|
||||||
|
Document.objects.filter(pk=instance.pk).update(**updates)
|
||||||
return
|
return
|
||||||
|
|
||||||
if move_original:
|
if move_original:
|
||||||
@@ -932,8 +966,25 @@ def run_workflows(
|
|||||||
if not use_overrides:
|
if not use_overrides:
|
||||||
# limit title to 128 characters
|
# limit title to 128 characters
|
||||||
document.title = document.title[:128]
|
document.title = document.title[:128]
|
||||||
# save first before setting tags
|
# Save only the fields that workflow actions can set directly.
|
||||||
document.save()
|
# Deliberately excludes filename and archive_filename — those are
|
||||||
|
# managed exclusively by update_filename_and_move_files via the
|
||||||
|
# post_save signal. Writing stale in-memory values here would revert
|
||||||
|
# a concurrent update_filename_and_move_files DB write, leaving the
|
||||||
|
# DB pointing at the old path while the file is already at the new
|
||||||
|
# one (see: https://github.com/paperless-ngx/paperless-ngx/issues/12386).
|
||||||
|
# modified has auto_now=True but is not auto-added when update_fields
|
||||||
|
# is specified, so it must be listed explicitly.
|
||||||
|
document.save(
|
||||||
|
update_fields=[
|
||||||
|
"title",
|
||||||
|
"correspondent",
|
||||||
|
"document_type",
|
||||||
|
"storage_path",
|
||||||
|
"owner",
|
||||||
|
"modified",
|
||||||
|
],
|
||||||
|
)
|
||||||
document.tags.set(doc_tag_ids)
|
document.tags.set(doc_tag_ids)
|
||||||
|
|
||||||
WorkflowRun.objects.create(
|
WorkflowRun.objects.create(
|
||||||
|
|||||||
@@ -52,8 +52,6 @@ from documents.models import StoragePath
|
|||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
from documents.models import WorkflowRun
|
from documents.models import WorkflowRun
|
||||||
from documents.models import WorkflowTrigger
|
from documents.models import WorkflowTrigger
|
||||||
from documents.parsers import DocumentParser
|
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
|
||||||
from documents.plugins.base import ConsumeTaskPlugin
|
from documents.plugins.base import ConsumeTaskPlugin
|
||||||
from documents.plugins.base import ProgressManager
|
from documents.plugins.base import ProgressManager
|
||||||
from documents.plugins.base import StopConsumeTaskError
|
from documents.plugins.base import StopConsumeTaskError
|
||||||
@@ -65,6 +63,8 @@ from documents.signals.handlers import run_workflows
|
|||||||
from documents.signals.handlers import send_websocket_document_updated
|
from documents.signals.handlers import send_websocket_document_updated
|
||||||
from documents.workflows.utils import get_workflows_for_trigger
|
from documents.workflows.utils import get_workflows_for_trigger
|
||||||
from paperless.config import AIConfig
|
from paperless.config import AIConfig
|
||||||
|
from paperless.parsers import ParserContext
|
||||||
|
from paperless.parsers.registry import get_parser_registry
|
||||||
from paperless_ai.indexing import llm_index_add_or_update_document
|
from paperless_ai.indexing import llm_index_add_or_update_document
|
||||||
from paperless_ai.indexing import llm_index_remove_document
|
from paperless_ai.indexing import llm_index_remove_document
|
||||||
from paperless_ai.indexing import update_llm_index
|
from paperless_ai.indexing import update_llm_index
|
||||||
@@ -100,7 +100,11 @@ def index_reindex(*, iter_wrapper: IterWrapper[Document] = _identity) -> None:
|
|||||||
|
|
||||||
|
|
||||||
@shared_task
|
@shared_task
|
||||||
def train_classifier(*, scheduled=True) -> None:
|
def train_classifier(
|
||||||
|
*,
|
||||||
|
scheduled=True,
|
||||||
|
status_callback: Callable[[str], None] | None = None,
|
||||||
|
) -> None:
|
||||||
task = PaperlessTask.objects.create(
|
task = PaperlessTask.objects.create(
|
||||||
type=PaperlessTask.TaskType.SCHEDULED_TASK
|
type=PaperlessTask.TaskType.SCHEDULED_TASK
|
||||||
if scheduled
|
if scheduled
|
||||||
@@ -136,7 +140,7 @@ def train_classifier(*, scheduled=True) -> None:
|
|||||||
classifier = DocumentClassifier()
|
classifier = DocumentClassifier()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if classifier.train():
|
if classifier.train(status_callback=status_callback):
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Saving updated classifier model to {settings.MODEL_FILE}...",
|
f"Saving updated classifier model to {settings.MODEL_FILE}...",
|
||||||
)
|
)
|
||||||
@@ -300,7 +304,11 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
|||||||
|
|
||||||
mime_type = document.mime_type
|
mime_type = document.mime_type
|
||||||
|
|
||||||
parser_class: type[DocumentParser] = get_parser_class_for_mime_type(mime_type)
|
parser_class = get_parser_registry().get_parser_for_file(
|
||||||
|
mime_type,
|
||||||
|
document.original_filename or "",
|
||||||
|
document.source_path,
|
||||||
|
)
|
||||||
|
|
||||||
if not parser_class:
|
if not parser_class:
|
||||||
logger.error(
|
logger.error(
|
||||||
@@ -309,16 +317,13 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
|
with parser_class() as parser:
|
||||||
|
parser.configure(ParserContext())
|
||||||
|
|
||||||
try:
|
try:
|
||||||
parser.parse(document.source_path, mime_type, document.get_public_filename())
|
parser.parse(document.source_path, mime_type)
|
||||||
|
|
||||||
thumbnail = parser.get_thumbnail(
|
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
|
||||||
document.source_path,
|
|
||||||
mime_type,
|
|
||||||
document.get_public_filename(),
|
|
||||||
)
|
|
||||||
|
|
||||||
with transaction.atomic():
|
with transaction.atomic():
|
||||||
oldDocument = Document.objects.get(pk=document.pk)
|
oldDocument = Document.objects.get(pk=document.pk)
|
||||||
@@ -398,9 +403,6 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
|||||||
logger.exception(
|
logger.exception(
|
||||||
f"Error while parsing document {document} (ID: {document_id})",
|
f"Error while parsing document {document} (ID: {document_id})",
|
||||||
)
|
)
|
||||||
finally:
|
|
||||||
# TODO(stumpylog): Cleanup once all parsers are handled
|
|
||||||
parser.cleanup()
|
|
||||||
|
|
||||||
|
|
||||||
@shared_task
|
@shared_task
|
||||||
|
|||||||
@@ -163,13 +163,23 @@ class TestRenderResultsSummary:
|
|||||||
class TestDocumentSanityCheckerCommand:
|
class TestDocumentSanityCheckerCommand:
|
||||||
def test_no_issues(self, sample_doc: Document) -> None:
|
def test_no_issues(self, sample_doc: Document) -> None:
|
||||||
out = StringIO()
|
out = StringIO()
|
||||||
call_command("document_sanity_checker", "--no-progress-bar", stdout=out)
|
call_command(
|
||||||
|
"document_sanity_checker",
|
||||||
|
"--no-progress-bar",
|
||||||
|
stdout=out,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
assert "No issues detected" in out.getvalue()
|
assert "No issues detected" in out.getvalue()
|
||||||
|
|
||||||
def test_missing_original(self, sample_doc: Document) -> None:
|
def test_missing_original(self, sample_doc: Document) -> None:
|
||||||
Path(sample_doc.source_path).unlink()
|
Path(sample_doc.source_path).unlink()
|
||||||
out = StringIO()
|
out = StringIO()
|
||||||
call_command("document_sanity_checker", "--no-progress-bar", stdout=out)
|
call_command(
|
||||||
|
"document_sanity_checker",
|
||||||
|
"--no-progress-bar",
|
||||||
|
stdout=out,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
output = out.getvalue()
|
output = out.getvalue()
|
||||||
assert "ERROR" in output
|
assert "ERROR" in output
|
||||||
assert "Original of document does not exist" in output
|
assert "Original of document does not exist" in output
|
||||||
@@ -187,7 +197,12 @@ class TestDocumentSanityCheckerCommand:
|
|||||||
Path(doc.thumbnail_path).touch()
|
Path(doc.thumbnail_path).touch()
|
||||||
|
|
||||||
out = StringIO()
|
out = StringIO()
|
||||||
call_command("document_sanity_checker", "--no-progress-bar", stdout=out)
|
call_command(
|
||||||
|
"document_sanity_checker",
|
||||||
|
"--no-progress-bar",
|
||||||
|
stdout=out,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
output = out.getvalue()
|
output = out.getvalue()
|
||||||
assert "ERROR" in output
|
assert "ERROR" in output
|
||||||
assert "Checksum mismatch. Stored: abc, actual:" in output
|
assert "Checksum mismatch. Stored: abc, actual:" in output
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from unittest.mock import patch
|
|||||||
|
|
||||||
from django.contrib.auth.models import User
|
from django.contrib.auth.models import User
|
||||||
from django.core.files.uploadedfile import SimpleUploadedFile
|
from django.core.files.uploadedfile import SimpleUploadedFile
|
||||||
|
from django.test import override_settings
|
||||||
from rest_framework import status
|
from rest_framework import status
|
||||||
from rest_framework.test import APITestCase
|
from rest_framework.test import APITestCase
|
||||||
|
|
||||||
@@ -693,3 +694,17 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
|
|||||||
content_type="application/json",
|
content_type="application/json",
|
||||||
)
|
)
|
||||||
mock_update.assert_called_once()
|
mock_update.assert_called_once()
|
||||||
|
|
||||||
|
@override_settings(LLM_ALLOW_INTERNAL_ENDPOINTS=False)
|
||||||
|
def test_update_llm_endpoint_blocks_internal_endpoint_when_disallowed(self) -> None:
|
||||||
|
response = self.client.patch(
|
||||||
|
f"{self.ENDPOINT}1/",
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"llm_endpoint": "http://127.0.0.1:11434",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
content_type="application/json",
|
||||||
|
)
|
||||||
|
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
|
||||||
|
self.assertIn("non-public address", str(response.data).lower())
|
||||||
|
|||||||
@@ -262,6 +262,50 @@ class TestBulkEditAPI(DirectoriesMixin, APITestCase):
|
|||||||
self.assertEqual(kwargs["add_custom_fields"], [self.cf1.id])
|
self.assertEqual(kwargs["add_custom_fields"], [self.cf1.id])
|
||||||
self.assertEqual(kwargs["remove_custom_fields"], [self.cf2.id])
|
self.assertEqual(kwargs["remove_custom_fields"], [self.cf2.id])
|
||||||
|
|
||||||
|
@mock.patch("documents.serialisers.bulk_edit.modify_custom_fields")
|
||||||
|
def test_api_modify_custom_fields_documentlink_forbidden_for_unpermitted_target(
|
||||||
|
self,
|
||||||
|
m,
|
||||||
|
) -> None:
|
||||||
|
self.setup_mock(m, "modify_custom_fields")
|
||||||
|
user = User.objects.create_user(username="doc-owner")
|
||||||
|
user.user_permissions.add(Permission.objects.get(codename="change_document"))
|
||||||
|
other_user = User.objects.create_user(username="other-user")
|
||||||
|
source_doc = Document.objects.create(
|
||||||
|
checksum="source",
|
||||||
|
title="Source",
|
||||||
|
owner=user,
|
||||||
|
)
|
||||||
|
target_doc = Document.objects.create(
|
||||||
|
checksum="target",
|
||||||
|
title="Target",
|
||||||
|
owner=other_user,
|
||||||
|
)
|
||||||
|
doclink_field = CustomField.objects.create(
|
||||||
|
name="doclink",
|
||||||
|
data_type=CustomField.FieldDataType.DOCUMENTLINK,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.client.force_authenticate(user=user)
|
||||||
|
|
||||||
|
response = self.client.post(
|
||||||
|
"/api/documents/bulk_edit/",
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"documents": [source_doc.id],
|
||||||
|
"method": "modify_custom_fields",
|
||||||
|
"parameters": {
|
||||||
|
"add_custom_fields": {doclink_field.id: [target_doc.id]},
|
||||||
|
"remove_custom_fields": [],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
),
|
||||||
|
content_type="application/json",
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
||||||
|
m.assert_not_called()
|
||||||
|
|
||||||
@mock.patch("documents.serialisers.bulk_edit.modify_custom_fields")
|
@mock.patch("documents.serialisers.bulk_edit.modify_custom_fields")
|
||||||
def test_api_modify_custom_fields_with_values(self, m) -> None:
|
def test_api_modify_custom_fields_with_values(self, m) -> None:
|
||||||
self.setup_mock(m, "modify_custom_fields")
|
self.setup_mock(m, "modify_custom_fields")
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from unittest.mock import ANY
|
|||||||
from django.contrib.auth.models import Permission
|
from django.contrib.auth.models import Permission
|
||||||
from django.contrib.auth.models import User
|
from django.contrib.auth.models import User
|
||||||
from django.test import override_settings
|
from django.test import override_settings
|
||||||
|
from guardian.shortcuts import assign_perm
|
||||||
from rest_framework import status
|
from rest_framework import status
|
||||||
from rest_framework.test import APITestCase
|
from rest_framework.test import APITestCase
|
||||||
|
|
||||||
@@ -1140,6 +1141,102 @@ class TestCustomFieldsAPI(DirectoriesMixin, APITestCase):
|
|||||||
self.assertEqual(resp.status_code, status.HTTP_200_OK)
|
self.assertEqual(resp.status_code, status.HTTP_200_OK)
|
||||||
self.assertEqual(doc5.custom_fields.first().value, [1])
|
self.assertEqual(doc5.custom_fields.first().value, [1])
|
||||||
|
|
||||||
|
def test_documentlink_patch_requires_change_permission_on_target_documents(
|
||||||
|
self,
|
||||||
|
) -> None:
|
||||||
|
source_owner = User.objects.create_user(username="source-owner")
|
||||||
|
source_owner.user_permissions.add(
|
||||||
|
Permission.objects.get(codename="change_document"),
|
||||||
|
)
|
||||||
|
other_user = User.objects.create_user(username="other-user")
|
||||||
|
|
||||||
|
source_doc = Document.objects.create(
|
||||||
|
title="Source",
|
||||||
|
checksum="source",
|
||||||
|
mime_type="application/pdf",
|
||||||
|
owner=source_owner,
|
||||||
|
)
|
||||||
|
target_doc = Document.objects.create(
|
||||||
|
title="Target",
|
||||||
|
checksum="target",
|
||||||
|
mime_type="application/pdf",
|
||||||
|
owner=other_user,
|
||||||
|
)
|
||||||
|
custom_field_doclink = CustomField.objects.create(
|
||||||
|
name="Test Custom Field Doc Link",
|
||||||
|
data_type=CustomField.FieldDataType.DOCUMENTLINK,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.client.force_authenticate(user=source_owner)
|
||||||
|
|
||||||
|
resp = self.client.patch(
|
||||||
|
f"/api/documents/{source_doc.id}/",
|
||||||
|
data={
|
||||||
|
"custom_fields": [
|
||||||
|
{
|
||||||
|
"field": custom_field_doclink.id,
|
||||||
|
"value": [target_doc.id],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
format="json",
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(resp.status_code, status.HTTP_403_FORBIDDEN)
|
||||||
|
self.assertEqual(
|
||||||
|
CustomFieldInstance.objects.filter(field=custom_field_doclink).count(),
|
||||||
|
0,
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_documentlink_patch_allowed_with_change_permission_on_target_documents(
|
||||||
|
self,
|
||||||
|
) -> None:
|
||||||
|
source_owner = User.objects.create_user(username="source-owner")
|
||||||
|
source_owner.user_permissions.add(
|
||||||
|
Permission.objects.get(codename="change_document"),
|
||||||
|
)
|
||||||
|
other_user = User.objects.create_user(username="other-user")
|
||||||
|
|
||||||
|
source_doc = Document.objects.create(
|
||||||
|
title="Source",
|
||||||
|
checksum="source",
|
||||||
|
mime_type="application/pdf",
|
||||||
|
owner=source_owner,
|
||||||
|
)
|
||||||
|
target_doc = Document.objects.create(
|
||||||
|
title="Target",
|
||||||
|
checksum="target",
|
||||||
|
mime_type="application/pdf",
|
||||||
|
owner=other_user,
|
||||||
|
)
|
||||||
|
custom_field_doclink = CustomField.objects.create(
|
||||||
|
name="Test Custom Field Doc Link",
|
||||||
|
data_type=CustomField.FieldDataType.DOCUMENTLINK,
|
||||||
|
)
|
||||||
|
|
||||||
|
assign_perm("change_document", source_owner, target_doc)
|
||||||
|
self.client.force_authenticate(user=source_owner)
|
||||||
|
|
||||||
|
resp = self.client.patch(
|
||||||
|
f"/api/documents/{source_doc.id}/",
|
||||||
|
data={
|
||||||
|
"custom_fields": [
|
||||||
|
{
|
||||||
|
"field": custom_field_doclink.id,
|
||||||
|
"value": [target_doc.id],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
format="json",
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(resp.status_code, status.HTTP_200_OK)
|
||||||
|
target_doc.refresh_from_db()
|
||||||
|
self.assertEqual(
|
||||||
|
target_doc.custom_fields.get(field=custom_field_doclink).value,
|
||||||
|
[source_doc.id],
|
||||||
|
)
|
||||||
|
|
||||||
def test_custom_field_filters(self) -> None:
|
def test_custom_field_filters(self) -> None:
|
||||||
custom_field_string = CustomField.objects.create(
|
custom_field_string = CustomField.objects.create(
|
||||||
name="Test Custom Field String",
|
name="Test Custom Field String",
|
||||||
|
|||||||
@@ -888,6 +888,19 @@ class TestApiUser(DirectoriesMixin, APITestCase):
|
|||||||
|
|
||||||
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
||||||
|
|
||||||
|
response = self.client.post(
|
||||||
|
f"{self.ENDPOINT}",
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"username": "user4",
|
||||||
|
"is_superuser": "true",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
content_type="application/json",
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
||||||
|
|
||||||
self.client.force_authenticate(user2)
|
self.client.force_authenticate(user2)
|
||||||
|
|
||||||
response = self.client.patch(
|
response = self.client.patch(
|
||||||
@@ -920,6 +933,65 @@ class TestApiUser(DirectoriesMixin, APITestCase):
|
|||||||
returned_user1 = User.objects.get(pk=user1.pk)
|
returned_user1 = User.objects.get(pk=user1.pk)
|
||||||
self.assertEqual(returned_user1.is_superuser, False)
|
self.assertEqual(returned_user1.is_superuser, False)
|
||||||
|
|
||||||
|
def test_only_superusers_can_create_or_alter_staff_status(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- Existing user account
|
||||||
|
WHEN:
|
||||||
|
- API request is made to add a user account with staff status
|
||||||
|
- API request is made to change staff status
|
||||||
|
THEN:
|
||||||
|
- Only superusers can change staff status
|
||||||
|
"""
|
||||||
|
|
||||||
|
user1 = User.objects.create_user(username="user1")
|
||||||
|
user1.user_permissions.add(*Permission.objects.all())
|
||||||
|
user2 = User.objects.create_superuser(username="user2")
|
||||||
|
|
||||||
|
self.client.force_authenticate(user1)
|
||||||
|
|
||||||
|
response = self.client.patch(
|
||||||
|
f"{self.ENDPOINT}{user1.pk}/",
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"is_staff": "true",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
content_type="application/json",
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
||||||
|
|
||||||
|
response = self.client.post(
|
||||||
|
f"{self.ENDPOINT}",
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"username": "user3",
|
||||||
|
"is_staff": 1,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
content_type="application/json",
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
||||||
|
|
||||||
|
self.client.force_authenticate(user2)
|
||||||
|
|
||||||
|
response = self.client.patch(
|
||||||
|
f"{self.ENDPOINT}{user1.pk}/",
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"is_staff": True,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
content_type="application/json",
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||||
|
|
||||||
|
returned_user1 = User.objects.get(pk=user1.pk)
|
||||||
|
self.assertEqual(returned_user1.is_staff, True)
|
||||||
|
|
||||||
|
|
||||||
class TestApiGroup(DirectoriesMixin, APITestCase):
|
class TestApiGroup(DirectoriesMixin, APITestCase):
|
||||||
ENDPOINT = "/api/groups/"
|
ENDPOINT = "/api/groups/"
|
||||||
|
|||||||
@@ -12,7 +12,12 @@ class TestApiSchema(APITestCase):
|
|||||||
Test that the schema is valid
|
Test that the schema is valid
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
call_command("spectacular", "--validate", "--fail-on-warn")
|
call_command(
|
||||||
|
"spectacular",
|
||||||
|
"--validate",
|
||||||
|
"--fail-on-warn",
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
except CommandError as e:
|
except CommandError as e:
|
||||||
self.fail(f"Schema validation failed: {e}")
|
self.fail(f"Schema validation failed: {e}")
|
||||||
|
|
||||||
|
|||||||
@@ -702,6 +702,40 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
|||||||
|
|
||||||
self.assertEqual(correction, None)
|
self.assertEqual(correction, None)
|
||||||
|
|
||||||
|
def test_search_spelling_suggestion_suppressed_for_private_terms(self):
|
||||||
|
owner = User.objects.create_user("owner")
|
||||||
|
attacker = User.objects.create_user("attacker")
|
||||||
|
attacker.user_permissions.add(
|
||||||
|
Permission.objects.get(codename="view_document"),
|
||||||
|
)
|
||||||
|
|
||||||
|
with AsyncWriter(index.open_index()) as writer:
|
||||||
|
for i in range(55):
|
||||||
|
private_doc = Document.objects.create(
|
||||||
|
checksum=f"p{i}",
|
||||||
|
pk=100 + i,
|
||||||
|
title=f"Private Document {i + 1}",
|
||||||
|
content=f"treasury document {i + 1}",
|
||||||
|
owner=owner,
|
||||||
|
)
|
||||||
|
visible_doc = Document.objects.create(
|
||||||
|
checksum=f"v{i}",
|
||||||
|
pk=200 + i,
|
||||||
|
title=f"Visible Document {i + 1}",
|
||||||
|
content=f"public ledger {i + 1}",
|
||||||
|
owner=attacker,
|
||||||
|
)
|
||||||
|
index.update_document(writer, private_doc)
|
||||||
|
index.update_document(writer, visible_doc)
|
||||||
|
|
||||||
|
self.client.force_authenticate(user=attacker)
|
||||||
|
|
||||||
|
response = self.client.get("/api/documents/?query=treasurx")
|
||||||
|
|
||||||
|
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||||
|
self.assertEqual(response.data["count"], 0)
|
||||||
|
self.assertIsNone(response.data["corrected_query"])
|
||||||
|
|
||||||
@mock.patch(
|
@mock.patch(
|
||||||
"whoosh.searching.Searcher.correct_query",
|
"whoosh.searching.Searcher.correct_query",
|
||||||
side_effect=Exception("Test error"),
|
side_effect=Exception("Test error"),
|
||||||
@@ -772,6 +806,60 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
|||||||
self.assertEqual(results[0]["id"], d3.id)
|
self.assertEqual(results[0]["id"], d3.id)
|
||||||
self.assertEqual(results[1]["id"], d1.id)
|
self.assertEqual(results[1]["id"], d1.id)
|
||||||
|
|
||||||
|
def test_search_more_like_requires_view_permission_on_seed_document(
|
||||||
|
self,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- A user can search documents they own
|
||||||
|
- Another user's private document exists with similar content
|
||||||
|
WHEN:
|
||||||
|
- The user requests more-like-this for the private seed document
|
||||||
|
THEN:
|
||||||
|
- The request is rejected
|
||||||
|
"""
|
||||||
|
owner = User.objects.create_user("owner")
|
||||||
|
attacker = User.objects.create_user("attacker")
|
||||||
|
attacker.user_permissions.add(
|
||||||
|
Permission.objects.get(codename="view_document"),
|
||||||
|
)
|
||||||
|
|
||||||
|
private_seed = Document.objects.create(
|
||||||
|
title="private bank statement",
|
||||||
|
content="quarterly treasury bank statement wire transfer",
|
||||||
|
checksum="seed",
|
||||||
|
owner=owner,
|
||||||
|
pk=10,
|
||||||
|
)
|
||||||
|
visible_doc = Document.objects.create(
|
||||||
|
title="attacker-visible match",
|
||||||
|
content="quarterly treasury bank statement wire transfer summary",
|
||||||
|
checksum="visible",
|
||||||
|
owner=attacker,
|
||||||
|
pk=11,
|
||||||
|
)
|
||||||
|
other_doc = Document.objects.create(
|
||||||
|
title="unrelated",
|
||||||
|
content="completely different topic",
|
||||||
|
checksum="other",
|
||||||
|
owner=attacker,
|
||||||
|
pk=12,
|
||||||
|
)
|
||||||
|
|
||||||
|
with AsyncWriter(index.open_index()) as writer:
|
||||||
|
index.update_document(writer, private_seed)
|
||||||
|
index.update_document(writer, visible_doc)
|
||||||
|
index.update_document(writer, other_doc)
|
||||||
|
|
||||||
|
self.client.force_authenticate(user=attacker)
|
||||||
|
|
||||||
|
response = self.client.get(
|
||||||
|
f"/api/documents/?more_like_id={private_seed.id}",
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
||||||
|
self.assertEqual(response.content, b"Insufficient permissions.")
|
||||||
|
|
||||||
def test_search_filtering(self) -> None:
|
def test_search_filtering(self) -> None:
|
||||||
t = Tag.objects.create(name="tag")
|
t = Tag.objects.create(name="tag")
|
||||||
t2 = Tag.objects.create(name="tag2")
|
t2 = Tag.objects.create(name="tag2")
|
||||||
@@ -1356,6 +1444,83 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
|||||||
self.assertEqual(results["custom_fields"][0]["id"], custom_field1.id)
|
self.assertEqual(results["custom_fields"][0]["id"], custom_field1.id)
|
||||||
self.assertEqual(results["workflows"][0]["id"], workflow1.id)
|
self.assertEqual(results["workflows"][0]["id"], workflow1.id)
|
||||||
|
|
||||||
|
def test_global_search_filters_owned_mail_objects(self) -> None:
|
||||||
|
user1 = User.objects.create_user("mail-search-user")
|
||||||
|
user2 = User.objects.create_user("other-mail-search-user")
|
||||||
|
user1.user_permissions.add(
|
||||||
|
Permission.objects.get(codename="view_mailaccount"),
|
||||||
|
Permission.objects.get(codename="view_mailrule"),
|
||||||
|
)
|
||||||
|
|
||||||
|
own_account = MailAccount.objects.create(
|
||||||
|
name="bank owned account",
|
||||||
|
username="owner@example.com",
|
||||||
|
password="secret",
|
||||||
|
imap_server="imap.owner.example.com",
|
||||||
|
imap_port=993,
|
||||||
|
imap_security=MailAccount.ImapSecurity.SSL,
|
||||||
|
character_set="UTF-8",
|
||||||
|
owner=user1,
|
||||||
|
)
|
||||||
|
other_account = MailAccount.objects.create(
|
||||||
|
name="bank other account",
|
||||||
|
username="other@example.com",
|
||||||
|
password="secret",
|
||||||
|
imap_server="imap.other.example.com",
|
||||||
|
imap_port=993,
|
||||||
|
imap_security=MailAccount.ImapSecurity.SSL,
|
||||||
|
character_set="UTF-8",
|
||||||
|
owner=user2,
|
||||||
|
)
|
||||||
|
unowned_account = MailAccount.objects.create(
|
||||||
|
name="bank shared account",
|
||||||
|
username="shared@example.com",
|
||||||
|
password="secret",
|
||||||
|
imap_server="imap.shared.example.com",
|
||||||
|
imap_port=993,
|
||||||
|
imap_security=MailAccount.ImapSecurity.SSL,
|
||||||
|
character_set="UTF-8",
|
||||||
|
)
|
||||||
|
own_rule = MailRule.objects.create(
|
||||||
|
name="bank owned rule",
|
||||||
|
account=own_account,
|
||||||
|
action=MailRule.MailAction.MOVE,
|
||||||
|
owner=user1,
|
||||||
|
)
|
||||||
|
other_rule = MailRule.objects.create(
|
||||||
|
name="bank other rule",
|
||||||
|
account=other_account,
|
||||||
|
action=MailRule.MailAction.MOVE,
|
||||||
|
owner=user2,
|
||||||
|
)
|
||||||
|
unowned_rule = MailRule.objects.create(
|
||||||
|
name="bank shared rule",
|
||||||
|
account=unowned_account,
|
||||||
|
action=MailRule.MailAction.MOVE,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.client.force_authenticate(user1)
|
||||||
|
|
||||||
|
response = self.client.get("/api/search/?query=bank")
|
||||||
|
|
||||||
|
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||||
|
self.assertCountEqual(
|
||||||
|
[account["id"] for account in response.data["mail_accounts"]],
|
||||||
|
[own_account.id, unowned_account.id],
|
||||||
|
)
|
||||||
|
self.assertCountEqual(
|
||||||
|
[rule["id"] for rule in response.data["mail_rules"]],
|
||||||
|
[own_rule.id, unowned_rule.id],
|
||||||
|
)
|
||||||
|
self.assertNotIn(
|
||||||
|
other_account.id,
|
||||||
|
[account["id"] for account in response.data["mail_accounts"]],
|
||||||
|
)
|
||||||
|
self.assertNotIn(
|
||||||
|
other_rule.id,
|
||||||
|
[rule["id"] for rule in response.data["mail_rules"]],
|
||||||
|
)
|
||||||
|
|
||||||
def test_global_search_bad_request(self) -> None:
|
def test_global_search_bad_request(self) -> None:
|
||||||
"""
|
"""
|
||||||
WHEN:
|
WHEN:
|
||||||
|
|||||||
@@ -26,6 +26,23 @@ class TestSystemStatus(APITestCase):
|
|||||||
self.override = override_settings(MEDIA_ROOT=self.tmp_dir)
|
self.override = override_settings(MEDIA_ROOT=self.tmp_dir)
|
||||||
self.override.enable()
|
self.override.enable()
|
||||||
|
|
||||||
|
# Mock slow network calls so tests don't block on real Redis/Celery timeouts.
|
||||||
|
# Individual tests that care about specific behaviour override these with
|
||||||
|
# their own @mock.patch decorators (which take precedence).
|
||||||
|
redis_patcher = mock.patch(
|
||||||
|
"redis.Redis.execute_command",
|
||||||
|
side_effect=Exception("Redis not available"),
|
||||||
|
)
|
||||||
|
self.mock_redis = redis_patcher.start()
|
||||||
|
self.addCleanup(redis_patcher.stop)
|
||||||
|
|
||||||
|
celery_patcher = mock.patch(
|
||||||
|
"celery.app.control.Inspect.ping",
|
||||||
|
side_effect=Exception("Celery not available"),
|
||||||
|
)
|
||||||
|
self.mock_celery_ping = celery_patcher.start()
|
||||||
|
self.addCleanup(celery_patcher.stop)
|
||||||
|
|
||||||
def tearDown(self) -> None:
|
def tearDown(self) -> None:
|
||||||
super().tearDown()
|
super().tearDown()
|
||||||
|
|
||||||
@@ -69,11 +86,18 @@ class TestSystemStatus(APITestCase):
|
|||||||
"""
|
"""
|
||||||
response = self.client.get(self.ENDPOINT)
|
response = self.client.get(self.ENDPOINT)
|
||||||
self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED)
|
self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED)
|
||||||
|
self.assertEqual(response["WWW-Authenticate"], "Token")
|
||||||
normal_user = User.objects.create_user(username="normal_user")
|
normal_user = User.objects.create_user(username="normal_user")
|
||||||
self.client.force_login(normal_user)
|
self.client.force_login(normal_user)
|
||||||
response = self.client.get(self.ENDPOINT)
|
response = self.client.get(self.ENDPOINT)
|
||||||
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
||||||
|
|
||||||
|
def test_system_status_with_bad_basic_auth_challenges(self) -> None:
|
||||||
|
self.client.credentials(HTTP_AUTHORIZATION="Basic invalid")
|
||||||
|
response = self.client.get(self.ENDPOINT)
|
||||||
|
self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED)
|
||||||
|
self.assertEqual(response["WWW-Authenticate"], 'Basic realm="api"')
|
||||||
|
|
||||||
def test_system_status_container_detection(self) -> None:
|
def test_system_status_container_detection(self) -> None:
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
@@ -84,11 +108,15 @@ class TestSystemStatus(APITestCase):
|
|||||||
- The response contains the correct install type
|
- The response contains the correct install type
|
||||||
"""
|
"""
|
||||||
self.client.force_login(self.user)
|
self.client.force_login(self.user)
|
||||||
os.environ["PNGX_CONTAINERIZED"] = "1"
|
with mock.patch.dict(os.environ, {"PNGX_CONTAINERIZED": "1"}, clear=False):
|
||||||
response = self.client.get(self.ENDPOINT)
|
response = self.client.get(self.ENDPOINT)
|
||||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||||
self.assertEqual(response.data["install_type"], "docker")
|
self.assertEqual(response.data["install_type"], "docker")
|
||||||
os.environ["KUBERNETES_SERVICE_HOST"] = "http://localhost"
|
with mock.patch.dict(
|
||||||
|
os.environ,
|
||||||
|
{"PNGX_CONTAINERIZED": "1", "KUBERNETES_SERVICE_HOST": "http://localhost"},
|
||||||
|
clear=False,
|
||||||
|
):
|
||||||
response = self.client.get(self.ENDPOINT)
|
response = self.client.get(self.ENDPOINT)
|
||||||
self.assertEqual(response.data["install_type"], "kubernetes")
|
self.assertEqual(response.data["install_type"], "kubernetes")
|
||||||
|
|
||||||
|
|||||||
@@ -13,8 +13,10 @@ class TestDocumentChecks(TestCase):
|
|||||||
def test_parser_check(self) -> None:
|
def test_parser_check(self) -> None:
|
||||||
self.assertEqual(parser_check(None), [])
|
self.assertEqual(parser_check(None), [])
|
||||||
|
|
||||||
with mock.patch("documents.checks.document_consumer_declaration.send") as m:
|
with mock.patch("documents.checks.get_parser_registry") as mock_registry_fn:
|
||||||
m.return_value = []
|
mock_registry = mock.MagicMock()
|
||||||
|
mock_registry.all_parsers.return_value = []
|
||||||
|
mock_registry_fn.return_value = mock_registry
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
parser_check(None),
|
parser_check(None),
|
||||||
|
|||||||
@@ -27,7 +27,6 @@ from documents.models import Document
|
|||||||
from documents.models import DocumentType
|
from documents.models import DocumentType
|
||||||
from documents.models import StoragePath
|
from documents.models import StoragePath
|
||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
from documents.parsers import DocumentParser
|
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from documents.plugins.helpers import ProgressStatusOptions
|
from documents.plugins.helpers import ProgressStatusOptions
|
||||||
from documents.tasks import sanity_check
|
from documents.tasks import sanity_check
|
||||||
@@ -36,65 +35,108 @@ from documents.tests.utils import DummyProgressManager
|
|||||||
from documents.tests.utils import FileSystemAssertsMixin
|
from documents.tests.utils import FileSystemAssertsMixin
|
||||||
from documents.tests.utils import GetConsumerMixin
|
from documents.tests.utils import GetConsumerMixin
|
||||||
from paperless_mail.models import MailRule
|
from paperless_mail.models import MailRule
|
||||||
from paperless_mail.parsers import MailDocumentParser
|
|
||||||
|
|
||||||
|
|
||||||
class _BaseTestParser(DocumentParser):
|
class _BaseNewStyleParser:
|
||||||
def get_settings(self) -> None:
|
"""Minimal ParserProtocol implementation for use in consumer tests."""
|
||||||
|
|
||||||
|
name: str = "test-parser"
|
||||||
|
version: str = "0.1"
|
||||||
|
author: str = "test"
|
||||||
|
url: str = "test"
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def supported_mime_types(cls) -> dict:
|
||||||
|
return {
|
||||||
|
"application/pdf": ".pdf",
|
||||||
|
"image/png": ".png",
|
||||||
|
"message/rfc822": ".eml",
|
||||||
|
}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def score(cls, mime_type: str, filename: str, path=None):
|
||||||
|
return 0 if mime_type in cls.supported_mime_types() else None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def can_produce_archive(self) -> bool:
|
||||||
|
return True
|
||||||
|
|
||||||
|
@property
|
||||||
|
def requires_pdf_rendition(self) -> bool:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self._tmpdir: Path | None = None
|
||||||
|
self._text: str | None = None
|
||||||
|
self._archive: Path | None = None
|
||||||
|
self._thumb: Path | None = None
|
||||||
|
|
||||||
|
def __enter__(self):
|
||||||
|
self._tmpdir = Path(
|
||||||
|
tempfile.mkdtemp(prefix="paperless-test-", dir=settings.SCRATCH_DIR),
|
||||||
|
)
|
||||||
|
_, thumb = tempfile.mkstemp(suffix=".webp", dir=self._tmpdir)
|
||||||
|
self._thumb = Path(thumb)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
|
||||||
|
if self._tmpdir and self._tmpdir.exists():
|
||||||
|
shutil.rmtree(self._tmpdir, ignore_errors=True)
|
||||||
|
|
||||||
|
def configure(self, context) -> None:
|
||||||
"""
|
"""
|
||||||
This parser does not implement additional settings yet
|
Test parser doesn't do anything with context
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def get_text(self) -> str | None:
|
||||||
|
return self._text
|
||||||
|
|
||||||
|
def get_date(self):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def get_archive_path(self):
|
||||||
|
return self._archive
|
||||||
|
|
||||||
class DummyParser(_BaseTestParser):
|
def get_thumbnail(self, document_path, mime_type) -> Path:
|
||||||
def __init__(self, logging_group, scratch_dir, archive_path) -> None:
|
return self._thumb
|
||||||
super().__init__(logging_group, None)
|
|
||||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
|
||||||
self.archive_path = archive_path
|
|
||||||
|
|
||||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
def get_page_count(self, document_path, mime_type):
|
||||||
return self.fake_thumb
|
return None
|
||||||
|
|
||||||
def parse(self, document_path, mime_type, file_name=None) -> None:
|
def extract_metadata(self, document_path, mime_type) -> list:
|
||||||
self.text = "The Text"
|
return []
|
||||||
|
|
||||||
|
|
||||||
class CopyParser(_BaseTestParser):
|
class DummyParser(_BaseNewStyleParser):
|
||||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
_ARCHIVE_SRC = (
|
||||||
return self.fake_thumb
|
Path(__file__).parent / "samples" / "documents" / "archive" / "0000001.pdf"
|
||||||
|
)
|
||||||
|
|
||||||
def __init__(self, logging_group, progress_callback=None) -> None:
|
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||||
super().__init__(logging_group, progress_callback)
|
self._text = "The Text"
|
||||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=self.tempdir)
|
if produce_archive and self._tmpdir:
|
||||||
|
self._archive = self._tmpdir / "archive.pdf"
|
||||||
def parse(self, document_path, mime_type, file_name=None) -> None:
|
shutil.copy(self._ARCHIVE_SRC, self._archive)
|
||||||
self.text = "The text"
|
|
||||||
self.archive_path = Path(self.tempdir / "archive.pdf")
|
|
||||||
shutil.copy(document_path, self.archive_path)
|
|
||||||
|
|
||||||
|
|
||||||
class FaultyParser(_BaseTestParser):
|
class CopyParser(_BaseNewStyleParser):
|
||||||
def __init__(self, logging_group, scratch_dir) -> None:
|
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||||
super().__init__(logging_group)
|
self._text = "The text"
|
||||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
if produce_archive and self._tmpdir:
|
||||||
|
self._archive = self._tmpdir / "archive.pdf"
|
||||||
|
shutil.copy(document_path, self._archive)
|
||||||
|
|
||||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
|
||||||
return self.fake_thumb
|
|
||||||
|
|
||||||
def parse(self, document_path, mime_type, file_name=None):
|
class FaultyParser(_BaseNewStyleParser):
|
||||||
|
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||||
raise ParseError("Does not compute.")
|
raise ParseError("Does not compute.")
|
||||||
|
|
||||||
|
|
||||||
class FaultyGenericExceptionParser(_BaseTestParser):
|
class FaultyGenericExceptionParser(_BaseNewStyleParser):
|
||||||
def __init__(self, logging_group, scratch_dir) -> None:
|
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||||
super().__init__(logging_group)
|
|
||||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
|
||||||
|
|
||||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
|
||||||
return self.fake_thumb
|
|
||||||
|
|
||||||
def parse(self, document_path, mime_type, file_name=None):
|
|
||||||
raise Exception("Generic exception.")
|
raise Exception("Generic exception.")
|
||||||
|
|
||||||
|
|
||||||
@@ -148,38 +190,12 @@ class TestConsumer(
|
|||||||
self.assertEqual(payload["data"]["max_progress"], last_progress_max)
|
self.assertEqual(payload["data"]["max_progress"], last_progress_max)
|
||||||
self.assertEqual(payload["data"]["status"], last_status)
|
self.assertEqual(payload["data"]["status"], last_status)
|
||||||
|
|
||||||
def make_dummy_parser(self, logging_group, progress_callback=None):
|
|
||||||
return DummyParser(
|
|
||||||
logging_group,
|
|
||||||
self.dirs.scratch_dir,
|
|
||||||
self.get_test_archive_file(),
|
|
||||||
)
|
|
||||||
|
|
||||||
def make_faulty_parser(self, logging_group, progress_callback=None):
|
|
||||||
return FaultyParser(logging_group, self.dirs.scratch_dir)
|
|
||||||
|
|
||||||
def make_faulty_generic_exception_parser(
|
|
||||||
self,
|
|
||||||
logging_group,
|
|
||||||
progress_callback=None,
|
|
||||||
):
|
|
||||||
return FaultyGenericExceptionParser(logging_group, self.dirs.scratch_dir)
|
|
||||||
|
|
||||||
def setUp(self) -> None:
|
def setUp(self) -> None:
|
||||||
super().setUp()
|
super().setUp()
|
||||||
|
|
||||||
patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
|
patcher = mock.patch("documents.consumer.get_parser_registry")
|
||||||
m = patcher.start()
|
mock_registry = patcher.start()
|
||||||
m.return_value = [
|
mock_registry.return_value.get_parser_for_file.return_value = DummyParser
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"parser": self.make_dummy_parser,
|
|
||||||
"mime_types": {"application/pdf": ".pdf"},
|
|
||||||
"weight": 0,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
]
|
|
||||||
self.addCleanup(patcher.stop)
|
self.addCleanup(patcher.stop)
|
||||||
|
|
||||||
def get_test_file(self):
|
def get_test_file(self):
|
||||||
@@ -548,9 +564,9 @@ class TestConsumer(
|
|||||||
) as consumer:
|
) as consumer:
|
||||||
consumer.run()
|
consumer.run()
|
||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.consumer.get_parser_registry")
|
||||||
def testNoParsers(self, m) -> None:
|
def testNoParsers(self, m) -> None:
|
||||||
m.return_value = []
|
m.return_value.get_parser_for_file.return_value = None
|
||||||
|
|
||||||
with self.assertRaisesMessage(
|
with self.assertRaisesMessage(
|
||||||
ConsumerError,
|
ConsumerError,
|
||||||
@@ -561,18 +577,9 @@ class TestConsumer(
|
|||||||
|
|
||||||
self._assert_first_last_send_progress(last_status="FAILED")
|
self._assert_first_last_send_progress(last_status="FAILED")
|
||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.consumer.get_parser_registry")
|
||||||
def testFaultyParser(self, m) -> None:
|
def testFaultyParser(self, m) -> None:
|
||||||
m.return_value = [
|
m.return_value.get_parser_for_file.return_value = FaultyParser
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"parser": self.make_faulty_parser,
|
|
||||||
"mime_types": {"application/pdf": ".pdf"},
|
|
||||||
"weight": 0,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
with self.get_consumer(self.get_test_file()) as consumer:
|
with self.get_consumer(self.get_test_file()) as consumer:
|
||||||
with self.assertRaisesMessage(
|
with self.assertRaisesMessage(
|
||||||
@@ -583,18 +590,9 @@ class TestConsumer(
|
|||||||
|
|
||||||
self._assert_first_last_send_progress(last_status="FAILED")
|
self._assert_first_last_send_progress(last_status="FAILED")
|
||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.consumer.get_parser_registry")
|
||||||
def testGenericParserException(self, m) -> None:
|
def testGenericParserException(self, m) -> None:
|
||||||
m.return_value = [
|
m.return_value.get_parser_for_file.return_value = FaultyGenericExceptionParser
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"parser": self.make_faulty_generic_exception_parser,
|
|
||||||
"mime_types": {"application/pdf": ".pdf"},
|
|
||||||
"weight": 0,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
with self.get_consumer(self.get_test_file()) as consumer:
|
with self.get_consumer(self.get_test_file()) as consumer:
|
||||||
with self.assertRaisesMessage(
|
with self.assertRaisesMessage(
|
||||||
@@ -642,6 +640,7 @@ class TestConsumer(
|
|||||||
self._assert_first_last_send_progress()
|
self._assert_first_last_send_progress()
|
||||||
|
|
||||||
@mock.patch("documents.consumer.generate_unique_filename")
|
@mock.patch("documents.consumer.generate_unique_filename")
|
||||||
|
@override_settings(FILENAME_FORMAT="{pk}")
|
||||||
def testFilenameHandlingFallsBackWhenGeneratedPathExceedsDbLimit(self, m):
|
def testFilenameHandlingFallsBackWhenGeneratedPathExceedsDbLimit(self, m):
|
||||||
m.side_effect = lambda doc, archive_filename=False: Path(
|
m.side_effect = lambda doc, archive_filename=False: Path(
|
||||||
("a" * 1100 + ".pdf") if not archive_filename else ("b" * 1100 + ".pdf"),
|
("a" * 1100 + ".pdf") if not archive_filename else ("b" * 1100 + ".pdf"),
|
||||||
@@ -1017,7 +1016,7 @@ class TestConsumer(
|
|||||||
self._assert_first_last_send_progress()
|
self._assert_first_last_send_progress()
|
||||||
|
|
||||||
@override_settings(FILENAME_FORMAT="{title}")
|
@override_settings(FILENAME_FORMAT="{title}")
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.consumer.get_parser_registry")
|
||||||
def test_similar_filenames(self, m) -> None:
|
def test_similar_filenames(self, m) -> None:
|
||||||
shutil.copy(
|
shutil.copy(
|
||||||
Path(__file__).parent / "samples" / "simple.pdf",
|
Path(__file__).parent / "samples" / "simple.pdf",
|
||||||
@@ -1031,16 +1030,7 @@ class TestConsumer(
|
|||||||
Path(__file__).parent / "samples" / "simple-noalpha.png",
|
Path(__file__).parent / "samples" / "simple-noalpha.png",
|
||||||
settings.CONSUMPTION_DIR / "simple.png.pdf",
|
settings.CONSUMPTION_DIR / "simple.png.pdf",
|
||||||
)
|
)
|
||||||
m.return_value = [
|
m.return_value.get_parser_for_file.return_value = CopyParser
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"parser": CopyParser,
|
|
||||||
"mime_types": {"application/pdf": ".pdf", "image/png": ".png"},
|
|
||||||
"weight": 0,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
with self.get_consumer(settings.CONSUMPTION_DIR / "simple.png") as consumer:
|
with self.get_consumer(settings.CONSUMPTION_DIR / "simple.png") as consumer:
|
||||||
consumer.run()
|
consumer.run()
|
||||||
@@ -1068,8 +1058,10 @@ class TestConsumer(
|
|||||||
|
|
||||||
sanity_check()
|
sanity_check()
|
||||||
|
|
||||||
|
@mock.patch("documents.consumer.get_parser_registry")
|
||||||
@mock.patch("documents.consumer.run_subprocess")
|
@mock.patch("documents.consumer.run_subprocess")
|
||||||
def test_try_to_clean_invalid_pdf(self, m) -> None:
|
def test_try_to_clean_invalid_pdf(self, m, mock_registry) -> None:
|
||||||
|
mock_registry.return_value.get_parser_for_file.return_value = None
|
||||||
shutil.copy(
|
shutil.copy(
|
||||||
Path(__file__).parent / "samples" / "invalid_pdf.pdf",
|
Path(__file__).parent / "samples" / "invalid_pdf.pdf",
|
||||||
settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
|
settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
|
||||||
@@ -1090,11 +1082,11 @@ class TestConsumer(
|
|||||||
self.assertEqual(command[1], "--replace-input")
|
self.assertEqual(command[1], "--replace-input")
|
||||||
|
|
||||||
@mock.patch("paperless_mail.models.MailRule.objects.get")
|
@mock.patch("paperless_mail.models.MailRule.objects.get")
|
||||||
@mock.patch("paperless_mail.parsers.MailDocumentParser.parse")
|
@mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.consumer.get_parser_registry")
|
||||||
def test_mail_parser_receives_mailrule(
|
def test_mail_parser_receives_mailrule(
|
||||||
self,
|
self,
|
||||||
mock_consumer_declaration_send: mock.Mock,
|
mock_get_parser_registry: mock.Mock,
|
||||||
mock_mail_parser_parse: mock.Mock,
|
mock_mail_parser_parse: mock.Mock,
|
||||||
mock_mailrule_get: mock.Mock,
|
mock_mailrule_get: mock.Mock,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -1106,25 +1098,21 @@ class TestConsumer(
|
|||||||
THEN:
|
THEN:
|
||||||
- The mail parser should receive the mail rule
|
- The mail parser should receive the mail rule
|
||||||
"""
|
"""
|
||||||
mock_consumer_declaration_send.return_value = [
|
from paperless.parsers.mail import MailDocumentParser
|
||||||
(
|
|
||||||
None,
|
mock_get_parser_registry.return_value.get_parser_for_file.return_value = (
|
||||||
{
|
MailDocumentParser
|
||||||
"parser": MailDocumentParser,
|
)
|
||||||
"mime_types": {"message/rfc822": ".eml"},
|
|
||||||
"weight": 0,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
]
|
|
||||||
mock_mailrule_get.return_value = mock.Mock(
|
mock_mailrule_get.return_value = mock.Mock(
|
||||||
pdf_layout=MailRule.PdfLayout.HTML_ONLY,
|
pdf_layout=MailRule.PdfLayout.HTML_ONLY,
|
||||||
)
|
)
|
||||||
with self.get_consumer(
|
with self.get_consumer(
|
||||||
filepath=(
|
filepath=(
|
||||||
Path(__file__).parent.parent.parent
|
Path(__file__).parent.parent.parent
|
||||||
/ Path("paperless_mail")
|
/ Path("paperless")
|
||||||
/ Path("tests")
|
/ Path("tests")
|
||||||
/ Path("samples")
|
/ Path("samples")
|
||||||
|
/ Path("mail")
|
||||||
).resolve()
|
).resolve()
|
||||||
/ "html.eml",
|
/ "html.eml",
|
||||||
source=DocumentSource.MailFetch,
|
source=DocumentSource.MailFetch,
|
||||||
@@ -1138,8 +1126,6 @@ class TestConsumer(
|
|||||||
mock_mail_parser_parse.assert_called_once_with(
|
mock_mail_parser_parse.assert_called_once_with(
|
||||||
consumer.working_copy,
|
consumer.working_copy,
|
||||||
"message/rfc822",
|
"message/rfc822",
|
||||||
file_name="sample.pdf",
|
|
||||||
mailrule=mock_mailrule_get.return_value,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import datetime
|
import datetime
|
||||||
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import tempfile
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -204,6 +205,52 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
)
|
)
|
||||||
self.assertEqual(document.filename, "none/none.pdf")
|
self.assertEqual(document.filename, "none/none.pdf")
|
||||||
|
|
||||||
|
@override_settings(FILENAME_FORMAT=None)
|
||||||
|
def test_stale_save_recovers_already_moved_files(self) -> None:
|
||||||
|
old_storage_path = StoragePath.objects.create(
|
||||||
|
name="old-path",
|
||||||
|
path="old/{{title}}",
|
||||||
|
)
|
||||||
|
new_storage_path = StoragePath.objects.create(
|
||||||
|
name="new-path",
|
||||||
|
path="new/{{title}}",
|
||||||
|
)
|
||||||
|
original_bytes = b"original"
|
||||||
|
archive_bytes = b"archive"
|
||||||
|
|
||||||
|
doc = Document.objects.create(
|
||||||
|
title="document",
|
||||||
|
mime_type="application/pdf",
|
||||||
|
checksum=hashlib.md5(original_bytes).hexdigest(),
|
||||||
|
archive_checksum=hashlib.md5(archive_bytes).hexdigest(),
|
||||||
|
filename="old/document.pdf",
|
||||||
|
archive_filename="old/document.pdf",
|
||||||
|
storage_path=old_storage_path,
|
||||||
|
)
|
||||||
|
create_source_path_directory(doc.source_path)
|
||||||
|
doc.source_path.write_bytes(original_bytes)
|
||||||
|
create_source_path_directory(doc.archive_path)
|
||||||
|
doc.archive_path.write_bytes(archive_bytes)
|
||||||
|
|
||||||
|
stale_doc = Document.objects.get(pk=doc.pk)
|
||||||
|
fresh_doc = Document.objects.get(pk=doc.pk)
|
||||||
|
fresh_doc.storage_path = new_storage_path
|
||||||
|
fresh_doc.save()
|
||||||
|
doc.refresh_from_db()
|
||||||
|
self.assertEqual(doc.filename, "new/document.pdf")
|
||||||
|
self.assertEqual(doc.archive_filename, "new/document.pdf")
|
||||||
|
|
||||||
|
stale_doc.storage_path = new_storage_path
|
||||||
|
stale_doc.save()
|
||||||
|
|
||||||
|
doc.refresh_from_db()
|
||||||
|
self.assertEqual(doc.filename, "new/document.pdf")
|
||||||
|
self.assertEqual(doc.archive_filename, "new/document.pdf")
|
||||||
|
self.assertIsFile(doc.source_path)
|
||||||
|
self.assertIsFile(doc.archive_path)
|
||||||
|
self.assertIsNotFile(settings.ORIGINALS_DIR / "old" / "document.pdf")
|
||||||
|
self.assertIsNotFile(settings.ARCHIVE_DIR / "old" / "document.pdf")
|
||||||
|
|
||||||
@override_settings(FILENAME_FORMAT="{correspondent}/{correspondent}")
|
@override_settings(FILENAME_FORMAT="{correspondent}/{correspondent}")
|
||||||
def test_document_delete(self) -> None:
|
def test_document_delete(self) -> None:
|
||||||
document = Document()
|
document = Document()
|
||||||
|
|||||||
@@ -1,7 +1,10 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import filecmp
|
import filecmp
|
||||||
import shutil
|
import shutil
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@@ -11,6 +14,9 @@ from django.core.management import call_command
|
|||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
from django.test import override_settings
|
from django.test import override_settings
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
from documents.file_handling import generate_filename
|
from documents.file_handling import generate_filename
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
from documents.tasks import update_document_content_maybe_archive_file
|
from documents.tasks import update_document_content_maybe_archive_file
|
||||||
@@ -35,7 +41,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
doc = self.make_models()
|
doc = self.make_models()
|
||||||
shutil.copy(sample_file, Path(self.dirs.originals_dir) / f"{doc.id:07}.pdf")
|
shutil.copy(sample_file, Path(self.dirs.originals_dir) / f"{doc.id:07}.pdf")
|
||||||
|
|
||||||
call_command("document_archiver", "--processes", "1")
|
call_command("document_archiver", "--processes", "1", skip_checks=True)
|
||||||
|
|
||||||
def test_handle_document(self) -> None:
|
def test_handle_document(self) -> None:
|
||||||
doc = self.make_models()
|
doc = self.make_models()
|
||||||
@@ -100,12 +106,12 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
class TestMakeIndex(TestCase):
|
class TestMakeIndex(TestCase):
|
||||||
@mock.patch("documents.management.commands.document_index.index_reindex")
|
@mock.patch("documents.management.commands.document_index.index_reindex")
|
||||||
def test_reindex(self, m) -> None:
|
def test_reindex(self, m) -> None:
|
||||||
call_command("document_index", "reindex")
|
call_command("document_index", "reindex", skip_checks=True)
|
||||||
m.assert_called_once()
|
m.assert_called_once()
|
||||||
|
|
||||||
@mock.patch("documents.management.commands.document_index.index_optimize")
|
@mock.patch("documents.management.commands.document_index.index_optimize")
|
||||||
def test_optimize(self, m) -> None:
|
def test_optimize(self, m) -> None:
|
||||||
call_command("document_index", "optimize")
|
call_command("document_index", "optimize", skip_checks=True)
|
||||||
m.assert_called_once()
|
m.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
@@ -122,7 +128,7 @@ class TestRenamer(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
Path(doc.archive_path).touch()
|
Path(doc.archive_path).touch()
|
||||||
|
|
||||||
with override_settings(FILENAME_FORMAT="{correspondent}/{title}"):
|
with override_settings(FILENAME_FORMAT="{correspondent}/{title}"):
|
||||||
call_command("document_renamer")
|
call_command("document_renamer", skip_checks=True)
|
||||||
|
|
||||||
doc2 = Document.objects.get(id=doc.id)
|
doc2 = Document.objects.get(id=doc.id)
|
||||||
|
|
||||||
@@ -135,14 +141,32 @@ class TestRenamer(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.management
|
@pytest.mark.management
|
||||||
class TestCreateClassifier(TestCase):
|
class TestCreateClassifier:
|
||||||
@mock.patch(
|
def test_create_classifier(self, mocker: MockerFixture) -> None:
|
||||||
|
m = mocker.patch(
|
||||||
"documents.management.commands.document_create_classifier.train_classifier",
|
"documents.management.commands.document_create_classifier.train_classifier",
|
||||||
)
|
)
|
||||||
def test_create_classifier(self, m) -> None:
|
|
||||||
call_command("document_create_classifier")
|
|
||||||
|
|
||||||
m.assert_called_once()
|
call_command("document_create_classifier", skip_checks=True)
|
||||||
|
|
||||||
|
m.assert_called_once_with(scheduled=False, status_callback=mocker.ANY)
|
||||||
|
assert callable(m.call_args.kwargs["status_callback"])
|
||||||
|
|
||||||
|
def test_create_classifier_callback_output(self, mocker: MockerFixture) -> None:
|
||||||
|
"""Callback passed to train_classifier writes each phase message to the console."""
|
||||||
|
m = mocker.patch(
|
||||||
|
"documents.management.commands.document_create_classifier.train_classifier",
|
||||||
|
)
|
||||||
|
|
||||||
|
def invoke_callback(**kwargs):
|
||||||
|
kwargs["status_callback"]("Vectorizing document content...")
|
||||||
|
|
||||||
|
m.side_effect = invoke_callback
|
||||||
|
|
||||||
|
stdout = StringIO()
|
||||||
|
call_command("document_create_classifier", skip_checks=True, stdout=stdout)
|
||||||
|
|
||||||
|
assert "Vectorizing document content..." in stdout.getvalue()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.management
|
@pytest.mark.management
|
||||||
@@ -152,7 +176,7 @@ class TestConvertMariaDBUUID(TestCase):
|
|||||||
m.alter_field.return_value = None
|
m.alter_field.return_value = None
|
||||||
|
|
||||||
stdout = StringIO()
|
stdout = StringIO()
|
||||||
call_command("convert_mariadb_uuid", stdout=stdout)
|
call_command("convert_mariadb_uuid", stdout=stdout, skip_checks=True)
|
||||||
|
|
||||||
m.assert_called_once()
|
m.assert_called_once()
|
||||||
|
|
||||||
@@ -167,6 +191,6 @@ class TestPruneAuditLogs(TestCase):
|
|||||||
object_id=1,
|
object_id=1,
|
||||||
action=LogEntry.Action.CREATE,
|
action=LogEntry.Action.CREATE,
|
||||||
)
|
)
|
||||||
call_command("prune_audit_logs")
|
call_command("prune_audit_logs", skip_checks=True)
|
||||||
|
|
||||||
self.assertEqual(LogEntry.objects.count(), 0)
|
self.assertEqual(LogEntry.objects.count(), 0)
|
||||||
|
|||||||
@@ -180,7 +180,7 @@ class TestExportImport(
|
|||||||
if data_only:
|
if data_only:
|
||||||
args += ["--data-only"]
|
args += ["--data-only"]
|
||||||
|
|
||||||
call_command(*args)
|
call_command(*args, skip_checks=True)
|
||||||
|
|
||||||
with (self.target / "manifest.json").open() as f:
|
with (self.target / "manifest.json").open() as f:
|
||||||
manifest = json.load(f)
|
manifest = json.load(f)
|
||||||
@@ -272,7 +272,12 @@ class TestExportImport(
|
|||||||
GroupObjectPermission.objects.all().delete()
|
GroupObjectPermission.objects.all().delete()
|
||||||
self.assertEqual(Document.objects.count(), 0)
|
self.assertEqual(Document.objects.count(), 0)
|
||||||
|
|
||||||
call_command("document_importer", "--no-progress-bar", self.target)
|
call_command(
|
||||||
|
"document_importer",
|
||||||
|
"--no-progress-bar",
|
||||||
|
self.target,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
self.assertEqual(Document.objects.count(), 4)
|
self.assertEqual(Document.objects.count(), 4)
|
||||||
self.assertEqual(Tag.objects.count(), 1)
|
self.assertEqual(Tag.objects.count(), 1)
|
||||||
self.assertEqual(Correspondent.objects.count(), 1)
|
self.assertEqual(Correspondent.objects.count(), 1)
|
||||||
@@ -438,7 +443,8 @@ class TestExportImport(
|
|||||||
filename="0000010.pdf",
|
filename="0000010.pdf",
|
||||||
mime_type="application/pdf",
|
mime_type="application/pdf",
|
||||||
)
|
)
|
||||||
self.assertRaises(FileNotFoundError, call_command, "document_exporter", target)
|
with self.assertRaises(FileNotFoundError):
|
||||||
|
call_command("document_exporter", target, skip_checks=True)
|
||||||
|
|
||||||
def test_export_zipped(self) -> None:
|
def test_export_zipped(self) -> None:
|
||||||
"""
|
"""
|
||||||
@@ -458,7 +464,7 @@ class TestExportImport(
|
|||||||
|
|
||||||
args = ["document_exporter", self.target, "--zip"]
|
args = ["document_exporter", self.target, "--zip"]
|
||||||
|
|
||||||
call_command(*args)
|
call_command(*args, skip_checks=True)
|
||||||
|
|
||||||
expected_file = str(
|
expected_file = str(
|
||||||
self.target / f"export-{timezone.localdate().isoformat()}.zip",
|
self.target / f"export-{timezone.localdate().isoformat()}.zip",
|
||||||
@@ -493,7 +499,7 @@ class TestExportImport(
|
|||||||
with override_settings(
|
with override_settings(
|
||||||
FILENAME_FORMAT="{created_year}/{correspondent}/{title}",
|
FILENAME_FORMAT="{created_year}/{correspondent}/{title}",
|
||||||
):
|
):
|
||||||
call_command(*args)
|
call_command(*args, skip_checks=True)
|
||||||
|
|
||||||
expected_file = str(
|
expected_file = str(
|
||||||
self.target / f"export-{timezone.localdate().isoformat()}.zip",
|
self.target / f"export-{timezone.localdate().isoformat()}.zip",
|
||||||
@@ -538,7 +544,7 @@ class TestExportImport(
|
|||||||
|
|
||||||
args = ["document_exporter", self.target, "--zip", "--delete"]
|
args = ["document_exporter", self.target, "--zip", "--delete"]
|
||||||
|
|
||||||
call_command(*args)
|
call_command(*args, skip_checks=True)
|
||||||
|
|
||||||
expected_file = str(
|
expected_file = str(
|
||||||
self.target / f"export-{timezone.localdate().isoformat()}.zip",
|
self.target / f"export-{timezone.localdate().isoformat()}.zip",
|
||||||
@@ -565,7 +571,7 @@ class TestExportImport(
|
|||||||
args = ["document_exporter", "/tmp/foo/bar"]
|
args = ["document_exporter", "/tmp/foo/bar"]
|
||||||
|
|
||||||
with self.assertRaises(CommandError) as e:
|
with self.assertRaises(CommandError) as e:
|
||||||
call_command(*args)
|
call_command(*args, skip_checks=True)
|
||||||
|
|
||||||
self.assertEqual("That path doesn't exist", str(e.exception))
|
self.assertEqual("That path doesn't exist", str(e.exception))
|
||||||
|
|
||||||
@@ -583,7 +589,7 @@ class TestExportImport(
|
|||||||
args = ["document_exporter", tmp_file.name]
|
args = ["document_exporter", tmp_file.name]
|
||||||
|
|
||||||
with self.assertRaises(CommandError) as e:
|
with self.assertRaises(CommandError) as e:
|
||||||
call_command(*args)
|
call_command(*args, skip_checks=True)
|
||||||
|
|
||||||
self.assertEqual("That path isn't a directory", str(e.exception))
|
self.assertEqual("That path isn't a directory", str(e.exception))
|
||||||
|
|
||||||
@@ -602,7 +608,7 @@ class TestExportImport(
|
|||||||
args = ["document_exporter", tmp_dir]
|
args = ["document_exporter", tmp_dir]
|
||||||
|
|
||||||
with self.assertRaises(CommandError) as e:
|
with self.assertRaises(CommandError) as e:
|
||||||
call_command(*args)
|
call_command(*args, skip_checks=True)
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
"That path doesn't appear to be writable",
|
"That path doesn't appear to be writable",
|
||||||
@@ -647,7 +653,12 @@ class TestExportImport(
|
|||||||
self.assertEqual(Document.objects.count(), 4)
|
self.assertEqual(Document.objects.count(), 4)
|
||||||
Document.objects.all().delete()
|
Document.objects.all().delete()
|
||||||
self.assertEqual(Document.objects.count(), 0)
|
self.assertEqual(Document.objects.count(), 0)
|
||||||
call_command("document_importer", "--no-progress-bar", self.target)
|
call_command(
|
||||||
|
"document_importer",
|
||||||
|
"--no-progress-bar",
|
||||||
|
self.target,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
self.assertEqual(Document.objects.count(), 4)
|
self.assertEqual(Document.objects.count(), 4)
|
||||||
|
|
||||||
def test_no_thumbnail(self) -> None:
|
def test_no_thumbnail(self) -> None:
|
||||||
@@ -690,7 +701,12 @@ class TestExportImport(
|
|||||||
self.assertEqual(Document.objects.count(), 4)
|
self.assertEqual(Document.objects.count(), 4)
|
||||||
Document.objects.all().delete()
|
Document.objects.all().delete()
|
||||||
self.assertEqual(Document.objects.count(), 0)
|
self.assertEqual(Document.objects.count(), 0)
|
||||||
call_command("document_importer", "--no-progress-bar", self.target)
|
call_command(
|
||||||
|
"document_importer",
|
||||||
|
"--no-progress-bar",
|
||||||
|
self.target,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
self.assertEqual(Document.objects.count(), 4)
|
self.assertEqual(Document.objects.count(), 4)
|
||||||
|
|
||||||
def test_split_manifest(self) -> None:
|
def test_split_manifest(self) -> None:
|
||||||
@@ -721,7 +737,12 @@ class TestExportImport(
|
|||||||
Document.objects.all().delete()
|
Document.objects.all().delete()
|
||||||
CustomFieldInstance.objects.all().delete()
|
CustomFieldInstance.objects.all().delete()
|
||||||
self.assertEqual(Document.objects.count(), 0)
|
self.assertEqual(Document.objects.count(), 0)
|
||||||
call_command("document_importer", "--no-progress-bar", self.target)
|
call_command(
|
||||||
|
"document_importer",
|
||||||
|
"--no-progress-bar",
|
||||||
|
self.target,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
self.assertEqual(Document.objects.count(), 4)
|
self.assertEqual(Document.objects.count(), 4)
|
||||||
self.assertEqual(CustomFieldInstance.objects.count(), 1)
|
self.assertEqual(CustomFieldInstance.objects.count(), 1)
|
||||||
|
|
||||||
@@ -746,7 +767,12 @@ class TestExportImport(
|
|||||||
self.assertEqual(Document.objects.count(), 4)
|
self.assertEqual(Document.objects.count(), 4)
|
||||||
Document.objects.all().delete()
|
Document.objects.all().delete()
|
||||||
self.assertEqual(Document.objects.count(), 0)
|
self.assertEqual(Document.objects.count(), 0)
|
||||||
call_command("document_importer", "--no-progress-bar", self.target)
|
call_command(
|
||||||
|
"document_importer",
|
||||||
|
"--no-progress-bar",
|
||||||
|
self.target,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
self.assertEqual(Document.objects.count(), 4)
|
self.assertEqual(Document.objects.count(), 4)
|
||||||
|
|
||||||
def test_folder_prefix_with_split(self) -> None:
|
def test_folder_prefix_with_split(self) -> None:
|
||||||
@@ -771,7 +797,12 @@ class TestExportImport(
|
|||||||
self.assertEqual(Document.objects.count(), 4)
|
self.assertEqual(Document.objects.count(), 4)
|
||||||
Document.objects.all().delete()
|
Document.objects.all().delete()
|
||||||
self.assertEqual(Document.objects.count(), 0)
|
self.assertEqual(Document.objects.count(), 0)
|
||||||
call_command("document_importer", "--no-progress-bar", self.target)
|
call_command(
|
||||||
|
"document_importer",
|
||||||
|
"--no-progress-bar",
|
||||||
|
self.target,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
self.assertEqual(Document.objects.count(), 4)
|
self.assertEqual(Document.objects.count(), 4)
|
||||||
|
|
||||||
def test_import_db_transaction_failed(self) -> None:
|
def test_import_db_transaction_failed(self) -> None:
|
||||||
@@ -813,7 +844,12 @@ class TestExportImport(
|
|||||||
self.user = User.objects.create(username="temp_admin")
|
self.user = User.objects.create(username="temp_admin")
|
||||||
|
|
||||||
with self.assertRaises(IntegrityError):
|
with self.assertRaises(IntegrityError):
|
||||||
call_command("document_importer", "--no-progress-bar", self.target)
|
call_command(
|
||||||
|
"document_importer",
|
||||||
|
"--no-progress-bar",
|
||||||
|
self.target,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
|
|
||||||
self.assertEqual(ContentType.objects.count(), num_content_type_objects)
|
self.assertEqual(ContentType.objects.count(), num_content_type_objects)
|
||||||
self.assertEqual(Permission.objects.count(), num_permission_objects + 1)
|
self.assertEqual(Permission.objects.count(), num_permission_objects + 1)
|
||||||
@@ -864,6 +900,7 @@ class TestExportImport(
|
|||||||
"--no-progress-bar",
|
"--no-progress-bar",
|
||||||
"--data-only",
|
"--data-only",
|
||||||
self.target,
|
self.target,
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(Document.objects.all().count(), 4)
|
self.assertEqual(Document.objects.all().count(), 4)
|
||||||
@@ -923,6 +960,7 @@ class TestCryptExportImport(
|
|||||||
"--passphrase",
|
"--passphrase",
|
||||||
"securepassword",
|
"securepassword",
|
||||||
self.target,
|
self.target,
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertIsFile(self.target / "metadata.json")
|
self.assertIsFile(self.target / "metadata.json")
|
||||||
@@ -948,6 +986,7 @@ class TestCryptExportImport(
|
|||||||
"--passphrase",
|
"--passphrase",
|
||||||
"securepassword",
|
"securepassword",
|
||||||
self.target,
|
self.target,
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
account = MailAccount.objects.first()
|
account = MailAccount.objects.first()
|
||||||
@@ -976,6 +1015,7 @@ class TestCryptExportImport(
|
|||||||
"--passphrase",
|
"--passphrase",
|
||||||
"securepassword",
|
"securepassword",
|
||||||
self.target,
|
self.target,
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
with self.assertRaises(CommandError) as err:
|
with self.assertRaises(CommandError) as err:
|
||||||
@@ -983,6 +1023,7 @@ class TestCryptExportImport(
|
|||||||
"document_importer",
|
"document_importer",
|
||||||
"--no-progress-bar",
|
"--no-progress-bar",
|
||||||
self.target,
|
self.target,
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
err.msg,
|
err.msg,
|
||||||
@@ -1014,6 +1055,7 @@ class TestCryptExportImport(
|
|||||||
"--no-progress-bar",
|
"--no-progress-bar",
|
||||||
str(self.target),
|
str(self.target),
|
||||||
stdout=stdout,
|
stdout=stdout,
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
stdout.seek(0)
|
stdout.seek(0)
|
||||||
self.assertIn(
|
self.assertIn(
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ class TestFuzzyMatchCommand(TestCase):
|
|||||||
*args,
|
*args,
|
||||||
stdout=stdout,
|
stdout=stdout,
|
||||||
stderr=stderr,
|
stderr=stderr,
|
||||||
|
skip_checks=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
return stdout.getvalue(), stderr.getvalue()
|
return stdout.getvalue(), stderr.getvalue()
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ class TestCommandImport(
|
|||||||
"document_importer",
|
"document_importer",
|
||||||
"--no-progress-bar",
|
"--no-progress-bar",
|
||||||
str(self.dirs.scratch_dir),
|
str(self.dirs.scratch_dir),
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
self.assertIn(
|
self.assertIn(
|
||||||
"That directory doesn't appear to contain a manifest.json file.",
|
"That directory doesn't appear to contain a manifest.json file.",
|
||||||
@@ -67,6 +68,7 @@ class TestCommandImport(
|
|||||||
"document_importer",
|
"document_importer",
|
||||||
"--no-progress-bar",
|
"--no-progress-bar",
|
||||||
str(self.dirs.scratch_dir),
|
str(self.dirs.scratch_dir),
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
self.assertIn(
|
self.assertIn(
|
||||||
"The manifest file contains a record which does not refer to an actual document file.",
|
"The manifest file contains a record which does not refer to an actual document file.",
|
||||||
@@ -96,6 +98,7 @@ class TestCommandImport(
|
|||||||
"document_importer",
|
"document_importer",
|
||||||
"--no-progress-bar",
|
"--no-progress-bar",
|
||||||
str(self.dirs.scratch_dir),
|
str(self.dirs.scratch_dir),
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
self.assertIn('The manifest file refers to "noexist.pdf"', str(e.exception))
|
self.assertIn('The manifest file refers to "noexist.pdf"', str(e.exception))
|
||||||
|
|
||||||
@@ -157,7 +160,7 @@ class TestCommandImport(
|
|||||||
- CommandError is raised indicating the issue
|
- CommandError is raised indicating the issue
|
||||||
"""
|
"""
|
||||||
with self.assertRaises(CommandError) as cm:
|
with self.assertRaises(CommandError) as cm:
|
||||||
call_command("document_importer", Path("/tmp/notapath"))
|
call_command("document_importer", Path("/tmp/notapath"), skip_checks=True)
|
||||||
self.assertIn("That path doesn't exist", str(cm.exception))
|
self.assertIn("That path doesn't exist", str(cm.exception))
|
||||||
|
|
||||||
def test_import_source_not_readable(self) -> None:
|
def test_import_source_not_readable(self) -> None:
|
||||||
@@ -173,7 +176,7 @@ class TestCommandImport(
|
|||||||
path = Path(temp_dir)
|
path = Path(temp_dir)
|
||||||
path.chmod(0o222)
|
path.chmod(0o222)
|
||||||
with self.assertRaises(CommandError) as cm:
|
with self.assertRaises(CommandError) as cm:
|
||||||
call_command("document_importer", path)
|
call_command("document_importer", path, skip_checks=True)
|
||||||
self.assertIn(
|
self.assertIn(
|
||||||
"That path doesn't appear to be readable",
|
"That path doesn't appear to be readable",
|
||||||
str(cm.exception),
|
str(cm.exception),
|
||||||
@@ -193,7 +196,12 @@ class TestCommandImport(
|
|||||||
self.assertIsNotFile(path)
|
self.assertIsNotFile(path)
|
||||||
|
|
||||||
with self.assertRaises(CommandError) as e:
|
with self.assertRaises(CommandError) as e:
|
||||||
call_command("document_importer", "--no-progress-bar", str(path))
|
call_command(
|
||||||
|
"document_importer",
|
||||||
|
"--no-progress-bar",
|
||||||
|
str(path),
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
self.assertIn("That path doesn't exist", str(e.exception))
|
self.assertIn("That path doesn't exist", str(e.exception))
|
||||||
|
|
||||||
def test_import_files_exist(self) -> None:
|
def test_import_files_exist(self) -> None:
|
||||||
@@ -218,6 +226,7 @@ class TestCommandImport(
|
|||||||
"--no-progress-bar",
|
"--no-progress-bar",
|
||||||
str(self.dirs.scratch_dir),
|
str(self.dirs.scratch_dir),
|
||||||
stdout=stdout,
|
stdout=stdout,
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
stdout.seek(0)
|
stdout.seek(0)
|
||||||
self.assertIn(
|
self.assertIn(
|
||||||
@@ -246,6 +255,7 @@ class TestCommandImport(
|
|||||||
"--no-progress-bar",
|
"--no-progress-bar",
|
||||||
str(self.dirs.scratch_dir),
|
str(self.dirs.scratch_dir),
|
||||||
stdout=stdout,
|
stdout=stdout,
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
stdout.seek(0)
|
stdout.seek(0)
|
||||||
self.assertIn(
|
self.assertIn(
|
||||||
@@ -282,6 +292,7 @@ class TestCommandImport(
|
|||||||
"--no-progress-bar",
|
"--no-progress-bar",
|
||||||
str(self.dirs.scratch_dir),
|
str(self.dirs.scratch_dir),
|
||||||
stdout=stdout,
|
stdout=stdout,
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
stdout.seek(0)
|
stdout.seek(0)
|
||||||
self.assertIn(
|
self.assertIn(
|
||||||
@@ -309,6 +320,7 @@ class TestCommandImport(
|
|||||||
"--no-progress-bar",
|
"--no-progress-bar",
|
||||||
str(self.dirs.scratch_dir),
|
str(self.dirs.scratch_dir),
|
||||||
stdout=stdout,
|
stdout=stdout,
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
stdout.seek(0)
|
stdout.seek(0)
|
||||||
stdout_str = str(stdout.read())
|
stdout_str = str(stdout.read())
|
||||||
@@ -338,6 +350,7 @@ class TestCommandImport(
|
|||||||
"--no-progress-bar",
|
"--no-progress-bar",
|
||||||
str(self.dirs.scratch_dir),
|
str(self.dirs.scratch_dir),
|
||||||
stdout=stdout,
|
stdout=stdout,
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
stdout.seek(0)
|
stdout.seek(0)
|
||||||
stdout_str = str(stdout.read())
|
stdout_str = str(stdout.read())
|
||||||
@@ -377,6 +390,7 @@ class TestCommandImport(
|
|||||||
"--no-progress-bar",
|
"--no-progress-bar",
|
||||||
str(zip_path),
|
str(zip_path),
|
||||||
stdout=stdout,
|
stdout=stdout,
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
stdout.seek(0)
|
stdout.seek(0)
|
||||||
stdout_str = str(stdout.read())
|
stdout_str = str(stdout.read())
|
||||||
|
|||||||
@@ -139,7 +139,7 @@ class TestRetaggerTags(DirectoriesMixin):
|
|||||||
@pytest.mark.usefixtures("documents")
|
@pytest.mark.usefixtures("documents")
|
||||||
def test_add_tags(self, tags: TagTuple) -> None:
|
def test_add_tags(self, tags: TagTuple) -> None:
|
||||||
tag_first, tag_second, *_ = tags
|
tag_first, tag_second, *_ = tags
|
||||||
call_command("document_retagger", "--tags")
|
call_command("document_retagger", "--tags", skip_checks=True)
|
||||||
d_first, d_second, d_unrelated, d_auto = _get_docs()
|
d_first, d_second, d_unrelated, d_auto = _get_docs()
|
||||||
|
|
||||||
assert d_first.tags.count() == 1
|
assert d_first.tags.count() == 1
|
||||||
@@ -158,7 +158,7 @@ class TestRetaggerTags(DirectoriesMixin):
|
|||||||
tag_first, tag_second, tag_inbox, tag_no_match, _ = tags
|
tag_first, tag_second, tag_inbox, tag_no_match, _ = tags
|
||||||
d1.tags.add(tag_second)
|
d1.tags.add(tag_second)
|
||||||
|
|
||||||
call_command("document_retagger", "--tags", "--overwrite")
|
call_command("document_retagger", "--tags", "--overwrite", skip_checks=True)
|
||||||
|
|
||||||
d_first, d_second, d_unrelated, d_auto = _get_docs()
|
d_first, d_second, d_unrelated, d_auto = _get_docs()
|
||||||
|
|
||||||
@@ -180,7 +180,13 @@ class TestRetaggerTags(DirectoriesMixin):
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_suggest_does_not_apply_tags(self, extra_args: list[str]) -> None:
|
def test_suggest_does_not_apply_tags(self, extra_args: list[str]) -> None:
|
||||||
call_command("document_retagger", "--tags", "--suggest", *extra_args)
|
call_command(
|
||||||
|
"document_retagger",
|
||||||
|
"--tags",
|
||||||
|
"--suggest",
|
||||||
|
*extra_args,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
d_first, d_second, _, d_auto = _get_docs()
|
d_first, d_second, _, d_auto = _get_docs()
|
||||||
|
|
||||||
assert d_first.tags.count() == 0
|
assert d_first.tags.count() == 0
|
||||||
@@ -199,7 +205,7 @@ class TestRetaggerDocumentType(DirectoriesMixin):
|
|||||||
@pytest.mark.usefixtures("documents")
|
@pytest.mark.usefixtures("documents")
|
||||||
def test_add_type(self, document_types: DocumentTypeTuple) -> None:
|
def test_add_type(self, document_types: DocumentTypeTuple) -> None:
|
||||||
dt_first, dt_second = document_types
|
dt_first, dt_second = document_types
|
||||||
call_command("document_retagger", "--document_type")
|
call_command("document_retagger", "--document_type", skip_checks=True)
|
||||||
d_first, d_second, _, _ = _get_docs()
|
d_first, d_second, _, _ = _get_docs()
|
||||||
|
|
||||||
assert d_first.document_type == dt_first
|
assert d_first.document_type == dt_first
|
||||||
@@ -214,7 +220,13 @@ class TestRetaggerDocumentType(DirectoriesMixin):
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_suggest_does_not_apply_document_type(self, extra_args: list[str]) -> None:
|
def test_suggest_does_not_apply_document_type(self, extra_args: list[str]) -> None:
|
||||||
call_command("document_retagger", "--document_type", "--suggest", *extra_args)
|
call_command(
|
||||||
|
"document_retagger",
|
||||||
|
"--document_type",
|
||||||
|
"--suggest",
|
||||||
|
*extra_args,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
d_first, d_second, _, _ = _get_docs()
|
d_first, d_second, _, _ = _get_docs()
|
||||||
|
|
||||||
assert d_first.document_type is None
|
assert d_first.document_type is None
|
||||||
@@ -243,7 +255,12 @@ class TestRetaggerDocumentType(DirectoriesMixin):
|
|||||||
)
|
)
|
||||||
doc = DocumentFactory(content="ambiguous content")
|
doc = DocumentFactory(content="ambiguous content")
|
||||||
|
|
||||||
call_command("document_retagger", "--document_type", *use_first_flag)
|
call_command(
|
||||||
|
"document_retagger",
|
||||||
|
"--document_type",
|
||||||
|
*use_first_flag,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
|
|
||||||
doc.refresh_from_db()
|
doc.refresh_from_db()
|
||||||
assert (doc.document_type is not None) is expects_assignment
|
assert (doc.document_type is not None) is expects_assignment
|
||||||
@@ -260,7 +277,7 @@ class TestRetaggerCorrespondent(DirectoriesMixin):
|
|||||||
@pytest.mark.usefixtures("documents")
|
@pytest.mark.usefixtures("documents")
|
||||||
def test_add_correspondent(self, correspondents: CorrespondentTuple) -> None:
|
def test_add_correspondent(self, correspondents: CorrespondentTuple) -> None:
|
||||||
c_first, c_second = correspondents
|
c_first, c_second = correspondents
|
||||||
call_command("document_retagger", "--correspondent")
|
call_command("document_retagger", "--correspondent", skip_checks=True)
|
||||||
d_first, d_second, _, _ = _get_docs()
|
d_first, d_second, _, _ = _get_docs()
|
||||||
|
|
||||||
assert d_first.correspondent == c_first
|
assert d_first.correspondent == c_first
|
||||||
@@ -275,7 +292,13 @@ class TestRetaggerCorrespondent(DirectoriesMixin):
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_suggest_does_not_apply_correspondent(self, extra_args: list[str]) -> None:
|
def test_suggest_does_not_apply_correspondent(self, extra_args: list[str]) -> None:
|
||||||
call_command("document_retagger", "--correspondent", "--suggest", *extra_args)
|
call_command(
|
||||||
|
"document_retagger",
|
||||||
|
"--correspondent",
|
||||||
|
"--suggest",
|
||||||
|
*extra_args,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
d_first, d_second, _, _ = _get_docs()
|
d_first, d_second, _, _ = _get_docs()
|
||||||
|
|
||||||
assert d_first.correspondent is None
|
assert d_first.correspondent is None
|
||||||
@@ -304,7 +327,12 @@ class TestRetaggerCorrespondent(DirectoriesMixin):
|
|||||||
)
|
)
|
||||||
doc = DocumentFactory(content="ambiguous content")
|
doc = DocumentFactory(content="ambiguous content")
|
||||||
|
|
||||||
call_command("document_retagger", "--correspondent", *use_first_flag)
|
call_command(
|
||||||
|
"document_retagger",
|
||||||
|
"--correspondent",
|
||||||
|
*use_first_flag,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
|
|
||||||
doc.refresh_from_db()
|
doc.refresh_from_db()
|
||||||
assert (doc.correspondent is not None) is expects_assignment
|
assert (doc.correspondent is not None) is expects_assignment
|
||||||
@@ -326,7 +354,7 @@ class TestRetaggerStoragePath(DirectoriesMixin):
|
|||||||
THEN matching documents get the correct path; existing path is unchanged
|
THEN matching documents get the correct path; existing path is unchanged
|
||||||
"""
|
"""
|
||||||
sp1, sp2, sp3 = storage_paths
|
sp1, sp2, sp3 = storage_paths
|
||||||
call_command("document_retagger", "--storage_path")
|
call_command("document_retagger", "--storage_path", skip_checks=True)
|
||||||
d_first, d_second, d_unrelated, d_auto = _get_docs()
|
d_first, d_second, d_unrelated, d_auto = _get_docs()
|
||||||
|
|
||||||
assert d_first.storage_path == sp2
|
assert d_first.storage_path == sp2
|
||||||
@@ -342,7 +370,12 @@ class TestRetaggerStoragePath(DirectoriesMixin):
|
|||||||
THEN the existing path is replaced by the newly matched path
|
THEN the existing path is replaced by the newly matched path
|
||||||
"""
|
"""
|
||||||
sp1, sp2, _ = storage_paths
|
sp1, sp2, _ = storage_paths
|
||||||
call_command("document_retagger", "--storage_path", "--overwrite")
|
call_command(
|
||||||
|
"document_retagger",
|
||||||
|
"--storage_path",
|
||||||
|
"--overwrite",
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
d_first, d_second, d_unrelated, d_auto = _get_docs()
|
d_first, d_second, d_unrelated, d_auto = _get_docs()
|
||||||
|
|
||||||
assert d_first.storage_path == sp2
|
assert d_first.storage_path == sp2
|
||||||
@@ -373,7 +406,12 @@ class TestRetaggerStoragePath(DirectoriesMixin):
|
|||||||
)
|
)
|
||||||
doc = DocumentFactory(content="ambiguous content")
|
doc = DocumentFactory(content="ambiguous content")
|
||||||
|
|
||||||
call_command("document_retagger", "--storage_path", *use_first_flag)
|
call_command(
|
||||||
|
"document_retagger",
|
||||||
|
"--storage_path",
|
||||||
|
*use_first_flag,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
|
|
||||||
doc.refresh_from_db()
|
doc.refresh_from_db()
|
||||||
assert (doc.storage_path is not None) is expects_assignment
|
assert (doc.storage_path is not None) is expects_assignment
|
||||||
@@ -402,7 +440,13 @@ class TestRetaggerIdRange(DirectoriesMixin):
|
|||||||
expected_count: int,
|
expected_count: int,
|
||||||
) -> None:
|
) -> None:
|
||||||
DocumentFactory(content="NOT the first document")
|
DocumentFactory(content="NOT the first document")
|
||||||
call_command("document_retagger", "--tags", "--id-range", *id_range_args)
|
call_command(
|
||||||
|
"document_retagger",
|
||||||
|
"--tags",
|
||||||
|
"--id-range",
|
||||||
|
*id_range_args,
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
tag_first, *_ = tags
|
tag_first, *_ = tags
|
||||||
assert Document.objects.filter(tags__id=tag_first.id).count() == expected_count
|
assert Document.objects.filter(tags__id=tag_first.id).count() == expected_count
|
||||||
|
|
||||||
@@ -416,7 +460,7 @@ class TestRetaggerIdRange(DirectoriesMixin):
|
|||||||
)
|
)
|
||||||
def test_id_range_invalid_arguments_raise(self, args: list[str]) -> None:
|
def test_id_range_invalid_arguments_raise(self, args: list[str]) -> None:
|
||||||
with pytest.raises((CommandError, SystemExit)):
|
with pytest.raises((CommandError, SystemExit)):
|
||||||
call_command("document_retagger", *args)
|
call_command("document_retagger", *args, skip_checks=True)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -430,12 +474,12 @@ class TestRetaggerEdgeCases(DirectoriesMixin):
|
|||||||
@pytest.mark.usefixtures("documents")
|
@pytest.mark.usefixtures("documents")
|
||||||
def test_no_targets_exits_cleanly(self) -> None:
|
def test_no_targets_exits_cleanly(self) -> None:
|
||||||
"""Calling the retagger with no classifier targets should not raise."""
|
"""Calling the retagger with no classifier targets should not raise."""
|
||||||
call_command("document_retagger")
|
call_command("document_retagger", skip_checks=True)
|
||||||
|
|
||||||
@pytest.mark.usefixtures("documents")
|
@pytest.mark.usefixtures("documents")
|
||||||
def test_inbox_only_skips_non_inbox_documents(self) -> None:
|
def test_inbox_only_skips_non_inbox_documents(self) -> None:
|
||||||
"""--inbox-only must restrict processing to documents with an inbox tag."""
|
"""--inbox-only must restrict processing to documents with an inbox tag."""
|
||||||
call_command("document_retagger", "--tags", "--inbox-only")
|
call_command("document_retagger", "--tags", "--inbox-only", skip_checks=True)
|
||||||
d_first, _, d_unrelated, _ = _get_docs()
|
d_first, _, d_unrelated, _ = _get_docs()
|
||||||
|
|
||||||
assert d_first.tags.count() == 0
|
assert d_first.tags.count() == 0
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ class TestManageSuperUser(DirectoriesMixin, TestCase):
|
|||||||
"--no-color",
|
"--no-color",
|
||||||
stdout=out,
|
stdout=out,
|
||||||
stderr=StringIO(),
|
stderr=StringIO(),
|
||||||
|
skip_checks=True,
|
||||||
)
|
)
|
||||||
return out.getvalue()
|
return out.getvalue()
|
||||||
|
|
||||||
|
|||||||
@@ -85,13 +85,20 @@ class TestMakeThumbnails(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
def test_command(self) -> None:
|
def test_command(self) -> None:
|
||||||
self.assertIsNotFile(self.d1.thumbnail_path)
|
self.assertIsNotFile(self.d1.thumbnail_path)
|
||||||
self.assertIsNotFile(self.d2.thumbnail_path)
|
self.assertIsNotFile(self.d2.thumbnail_path)
|
||||||
call_command("document_thumbnails", "--processes", "1")
|
call_command("document_thumbnails", "--processes", "1", skip_checks=True)
|
||||||
self.assertIsFile(self.d1.thumbnail_path)
|
self.assertIsFile(self.d1.thumbnail_path)
|
||||||
self.assertIsFile(self.d2.thumbnail_path)
|
self.assertIsFile(self.d2.thumbnail_path)
|
||||||
|
|
||||||
def test_command_documentid(self) -> None:
|
def test_command_documentid(self) -> None:
|
||||||
self.assertIsNotFile(self.d1.thumbnail_path)
|
self.assertIsNotFile(self.d1.thumbnail_path)
|
||||||
self.assertIsNotFile(self.d2.thumbnail_path)
|
self.assertIsNotFile(self.d2.thumbnail_path)
|
||||||
call_command("document_thumbnails", "--processes", "1", "-d", f"{self.d1.id}")
|
call_command(
|
||||||
|
"document_thumbnails",
|
||||||
|
"--processes",
|
||||||
|
"1",
|
||||||
|
"-d",
|
||||||
|
f"{self.d1.id}",
|
||||||
|
skip_checks=True,
|
||||||
|
)
|
||||||
self.assertIsFile(self.d1.thumbnail_path)
|
self.assertIsFile(self.d1.thumbnail_path)
|
||||||
self.assertIsNotFile(self.d2.thumbnail_path)
|
self.assertIsNotFile(self.d2.thumbnail_path)
|
||||||
|
|||||||
@@ -1,130 +1,14 @@
|
|||||||
from tempfile import TemporaryDirectory
|
|
||||||
from unittest import mock
|
|
||||||
|
|
||||||
from django.apps import apps
|
|
||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
from django.test import override_settings
|
from django.test import override_settings
|
||||||
|
|
||||||
from documents.parsers import get_default_file_extension
|
from documents.parsers import get_default_file_extension
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
|
||||||
from documents.parsers import get_supported_file_extensions
|
from documents.parsers import get_supported_file_extensions
|
||||||
from documents.parsers import is_file_ext_supported
|
from documents.parsers import is_file_ext_supported
|
||||||
|
from paperless.parsers.registry import get_parser_registry
|
||||||
|
from paperless.parsers.registry import reset_parser_registry
|
||||||
|
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
from paperless_tika.parsers import TikaDocumentParser
|
|
||||||
|
|
||||||
|
|
||||||
class TestParserDiscovery(TestCase):
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
|
||||||
def test_get_parser_class_1_parser(self, m, *args) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- Parser declared for a given mimetype
|
|
||||||
WHEN:
|
|
||||||
- Attempt to get parser for the mimetype
|
|
||||||
THEN:
|
|
||||||
- Declared parser class is returned
|
|
||||||
"""
|
|
||||||
|
|
||||||
class DummyParser:
|
|
||||||
pass
|
|
||||||
|
|
||||||
m.return_value = (
|
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"weight": 0,
|
|
||||||
"parser": DummyParser,
|
|
||||||
"mime_types": {"application/pdf": ".pdf"},
|
|
||||||
},
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
|
|
||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
|
||||||
def test_get_parser_class_n_parsers(self, m, *args) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- Two parsers declared for a given mimetype
|
|
||||||
- Second parser has a higher weight
|
|
||||||
WHEN:
|
|
||||||
- Attempt to get parser for the mimetype
|
|
||||||
THEN:
|
|
||||||
- Second parser class is returned
|
|
||||||
"""
|
|
||||||
|
|
||||||
class DummyParser1:
|
|
||||||
pass
|
|
||||||
|
|
||||||
class DummyParser2:
|
|
||||||
pass
|
|
||||||
|
|
||||||
m.return_value = (
|
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"weight": 0,
|
|
||||||
"parser": DummyParser1,
|
|
||||||
"mime_types": {"application/pdf": ".pdf"},
|
|
||||||
},
|
|
||||||
),
|
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"weight": 1,
|
|
||||||
"parser": DummyParser2,
|
|
||||||
"mime_types": {"application/pdf": ".pdf"},
|
|
||||||
},
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertEqual(
|
|
||||||
get_parser_class_for_mime_type("application/pdf"),
|
|
||||||
DummyParser2,
|
|
||||||
)
|
|
||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
|
||||||
def test_get_parser_class_0_parsers(self, m, *args) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- No parsers are declared
|
|
||||||
WHEN:
|
|
||||||
- Attempt to get parser for the mimetype
|
|
||||||
THEN:
|
|
||||||
- No parser class is returned
|
|
||||||
"""
|
|
||||||
m.return_value = []
|
|
||||||
with TemporaryDirectory():
|
|
||||||
self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
|
|
||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
|
||||||
def test_get_parser_class_no_valid_parser(self, m, *args) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- No parser declared for a given mimetype
|
|
||||||
- Parser declared for a different mimetype
|
|
||||||
WHEN:
|
|
||||||
- Attempt to get parser for the given mimetype
|
|
||||||
THEN:
|
|
||||||
- No parser class is returned
|
|
||||||
"""
|
|
||||||
|
|
||||||
class DummyParser:
|
|
||||||
pass
|
|
||||||
|
|
||||||
m.return_value = (
|
|
||||||
(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"weight": 0,
|
|
||||||
"parser": DummyParser,
|
|
||||||
"mime_types": {"application/pdf": ".pdf"},
|
|
||||||
},
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
|
|
||||||
|
|
||||||
|
|
||||||
class TestParserAvailability(TestCase):
|
class TestParserAvailability(TestCase):
|
||||||
@@ -151,7 +35,7 @@ class TestParserAvailability(TestCase):
|
|||||||
self.assertIn(ext, supported_exts)
|
self.assertIn(ext, supported_exts)
|
||||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||||
self.assertIsInstance(
|
self.assertIsInstance(
|
||||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||||
RasterisedDocumentParser,
|
RasterisedDocumentParser,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -175,7 +59,7 @@ class TestParserAvailability(TestCase):
|
|||||||
self.assertIn(ext, supported_exts)
|
self.assertIn(ext, supported_exts)
|
||||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||||
self.assertIsInstance(
|
self.assertIsInstance(
|
||||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||||
TextDocumentParser,
|
TextDocumentParser,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -198,22 +82,23 @@ class TestParserAvailability(TestCase):
|
|||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
# Force the app ready to notice the settings override
|
self.addCleanup(reset_parser_registry)
|
||||||
with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
|
|
||||||
app = apps.get_app_config("paperless_tika")
|
# Reset and rebuild the registry with Tika enabled.
|
||||||
app.ready()
|
with override_settings(TIKA_ENABLED=True):
|
||||||
|
reset_parser_registry()
|
||||||
supported_exts = get_supported_file_extensions()
|
supported_exts = get_supported_file_extensions()
|
||||||
|
|
||||||
for mime_type, ext in supported_mimes_and_exts:
|
for mime_type, ext in supported_mimes_and_exts:
|
||||||
self.assertIn(ext, supported_exts)
|
self.assertIn(ext, supported_exts)
|
||||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||||
self.assertIsInstance(
|
self.assertIsInstance(
|
||||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||||
TikaDocumentParser,
|
TikaDocumentParser,
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_no_parser_for_mime(self) -> None:
|
def test_no_parser_for_mime(self) -> None:
|
||||||
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
|
self.assertIsNone(get_parser_registry().get_parser_for_file("text/sdgsdf", ""))
|
||||||
|
|
||||||
def test_default_extension(self) -> None:
|
def test_default_extension(self) -> None:
|
||||||
# Test no parser declared still returns a an extension
|
# Test no parser declared still returns a an extension
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ from rest_framework.test import APIClient
|
|||||||
from rest_framework.test import APITestCase
|
from rest_framework.test import APITestCase
|
||||||
|
|
||||||
from documents.file_handling import create_source_path_directory
|
from documents.file_handling import create_source_path_directory
|
||||||
|
from documents.file_handling import generate_filename
|
||||||
from documents.file_handling import generate_unique_filename
|
from documents.file_handling import generate_unique_filename
|
||||||
from documents.signals.handlers import run_workflows
|
from documents.signals.handlers import run_workflows
|
||||||
from documents.workflows.webhooks import send_webhook
|
from documents.workflows.webhooks import send_webhook
|
||||||
@@ -905,6 +906,121 @@ class TestWorkflows(
|
|||||||
expected_str = f"Document matched {trigger} from {w}"
|
expected_str = f"Document matched {trigger} from {w}"
|
||||||
self.assertIn(expected_str, cm.output[0])
|
self.assertIn(expected_str, cm.output[0])
|
||||||
|
|
||||||
|
def test_workflow_assign_custom_field_keeps_storage_filename_in_sync(self) -> None:
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- Existing document with a storage path template that depends on a custom field
|
||||||
|
- Existing workflow triggered on document update assigning that custom field
|
||||||
|
WHEN:
|
||||||
|
- Workflow runs for the document
|
||||||
|
THEN:
|
||||||
|
- The database filename remains aligned with the moved file on disk
|
||||||
|
"""
|
||||||
|
storage_path = StoragePath.objects.create(
|
||||||
|
name="workflow-custom-field-path",
|
||||||
|
path="{{ custom_fields|get_cf_value('Custom Field 1', 'none') }}/{{ title }}",
|
||||||
|
)
|
||||||
|
doc = Document.objects.create(
|
||||||
|
title="workflow custom field sync",
|
||||||
|
mime_type="application/pdf",
|
||||||
|
checksum="workflow-custom-field-sync",
|
||||||
|
storage_path=storage_path,
|
||||||
|
original_filename="workflow-custom-field-sync.pdf",
|
||||||
|
)
|
||||||
|
CustomFieldInstance.objects.create(
|
||||||
|
document=doc,
|
||||||
|
field=self.cf1,
|
||||||
|
value_text="initial",
|
||||||
|
)
|
||||||
|
|
||||||
|
generated = generate_unique_filename(doc)
|
||||||
|
destination = (settings.ORIGINALS_DIR / generated).resolve()
|
||||||
|
create_source_path_directory(destination)
|
||||||
|
shutil.copy(self.SAMPLE_DIR / "simple.pdf", destination)
|
||||||
|
Document.objects.filter(pk=doc.pk).update(filename=generated.as_posix())
|
||||||
|
doc.refresh_from_db()
|
||||||
|
|
||||||
|
trigger = WorkflowTrigger.objects.create(
|
||||||
|
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_UPDATED,
|
||||||
|
)
|
||||||
|
action = WorkflowAction.objects.create(
|
||||||
|
type=WorkflowAction.WorkflowActionType.ASSIGNMENT,
|
||||||
|
assign_custom_fields_values={self.cf1.pk: "cars"},
|
||||||
|
)
|
||||||
|
action.assign_custom_fields.add(self.cf1.pk)
|
||||||
|
workflow = Workflow.objects.create(
|
||||||
|
name="Workflow custom field filename sync",
|
||||||
|
order=0,
|
||||||
|
)
|
||||||
|
workflow.triggers.add(trigger)
|
||||||
|
workflow.actions.add(action)
|
||||||
|
workflow.save()
|
||||||
|
|
||||||
|
run_workflows(WorkflowTrigger.WorkflowTriggerType.DOCUMENT_UPDATED, doc)
|
||||||
|
|
||||||
|
doc.refresh_from_db()
|
||||||
|
expected_filename = generate_filename(doc)
|
||||||
|
self.assertEqual(Path(doc.filename), expected_filename)
|
||||||
|
self.assertTrue(doc.source_path.is_file())
|
||||||
|
|
||||||
|
def test_workflow_document_updated_does_not_overwrite_filename(self) -> None:
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- A document whose filename has been updated in the DB by a concurrent
|
||||||
|
bulk_update_documents task (simulating update_filename_and_move_files
|
||||||
|
completing and writing the new filename to the DB)
|
||||||
|
- A stale in-memory document instance still holding the old filename
|
||||||
|
- An active DOCUMENT_UPDATED workflow
|
||||||
|
WHEN:
|
||||||
|
- run_workflows is called with the stale in-memory instance
|
||||||
|
(as would happen in the second concurrent bulk_update_documents task)
|
||||||
|
THEN:
|
||||||
|
- The DB filename is NOT overwritten with the stale in-memory value
|
||||||
|
(regression test for GH #12386 — the race window between
|
||||||
|
refresh_from_db and document.save in run_workflows)
|
||||||
|
"""
|
||||||
|
trigger = WorkflowTrigger.objects.create(
|
||||||
|
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_UPDATED,
|
||||||
|
)
|
||||||
|
action = WorkflowAction.objects.create(
|
||||||
|
type=WorkflowAction.WorkflowActionType.ASSIGNMENT,
|
||||||
|
assign_title="Updated by workflow",
|
||||||
|
)
|
||||||
|
workflow = Workflow.objects.create(name="Race condition test workflow", order=0)
|
||||||
|
workflow.triggers.add(trigger)
|
||||||
|
workflow.actions.add(action)
|
||||||
|
workflow.save()
|
||||||
|
|
||||||
|
doc = Document.objects.create(
|
||||||
|
title="race condition test",
|
||||||
|
mime_type="application/pdf",
|
||||||
|
checksum="racecondition123",
|
||||||
|
original_filename="old.pdf",
|
||||||
|
filename="old/path/old.pdf",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Simulate BUD-1 completing update_filename_and_move_files:
|
||||||
|
# the DB now holds the new filename while BUD-2's in-memory instance is stale.
|
||||||
|
new_filename = "new/path/new.pdf"
|
||||||
|
Document.global_objects.filter(pk=doc.pk).update(filename=new_filename)
|
||||||
|
|
||||||
|
# The stale instance still has filename="old/path/old.pdf" in memory.
|
||||||
|
# Mock refresh_from_db so the stale value persists through run_workflows,
|
||||||
|
# replicating the race window between refresh and save.
|
||||||
|
# Mock update_filename_and_move_files to prevent file-not-found errors
|
||||||
|
# since we are only testing DB state here.
|
||||||
|
with (
|
||||||
|
mock.patch(
|
||||||
|
"documents.signals.handlers.update_filename_and_move_files",
|
||||||
|
),
|
||||||
|
mock.patch.object(Document, "refresh_from_db"),
|
||||||
|
):
|
||||||
|
run_workflows(WorkflowTrigger.WorkflowTriggerType.DOCUMENT_UPDATED, doc)
|
||||||
|
|
||||||
|
# The DB filename must not have been reverted to the stale old value.
|
||||||
|
doc.refresh_from_db()
|
||||||
|
self.assertEqual(doc.filename, new_filename)
|
||||||
|
|
||||||
def test_document_added_workflow(self) -> None:
|
def test_document_added_workflow(self) -> None:
|
||||||
trigger = WorkflowTrigger.objects.create(
|
trigger = WorkflowTrigger.objects.create(
|
||||||
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
|
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
|
||||||
|
|||||||
@@ -82,6 +82,7 @@ from rest_framework import serializers
|
|||||||
from rest_framework import status
|
from rest_framework import status
|
||||||
from rest_framework.decorators import action
|
from rest_framework.decorators import action
|
||||||
from rest_framework.exceptions import NotFound
|
from rest_framework.exceptions import NotFound
|
||||||
|
from rest_framework.exceptions import PermissionDenied
|
||||||
from rest_framework.exceptions import ValidationError
|
from rest_framework.exceptions import ValidationError
|
||||||
from rest_framework.filters import OrderingFilter
|
from rest_framework.filters import OrderingFilter
|
||||||
from rest_framework.filters import SearchFilter
|
from rest_framework.filters import SearchFilter
|
||||||
@@ -157,7 +158,6 @@ from documents.models import UiSettings
|
|||||||
from documents.models import Workflow
|
from documents.models import Workflow
|
||||||
from documents.models import WorkflowAction
|
from documents.models import WorkflowAction
|
||||||
from documents.models import WorkflowTrigger
|
from documents.models import WorkflowTrigger
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
|
||||||
from documents.permissions import AcknowledgeTasksPermissions
|
from documents.permissions import AcknowledgeTasksPermissions
|
||||||
from documents.permissions import PaperlessAdminPermissions
|
from documents.permissions import PaperlessAdminPermissions
|
||||||
from documents.permissions import PaperlessNotePermissions
|
from documents.permissions import PaperlessNotePermissions
|
||||||
@@ -225,6 +225,7 @@ from paperless.celery import app as celery_app
|
|||||||
from paperless.config import AIConfig
|
from paperless.config import AIConfig
|
||||||
from paperless.config import GeneralConfig
|
from paperless.config import GeneralConfig
|
||||||
from paperless.models import ApplicationConfiguration
|
from paperless.models import ApplicationConfiguration
|
||||||
|
from paperless.parsers.registry import get_parser_registry
|
||||||
from paperless.serialisers import GroupSerializer
|
from paperless.serialisers import GroupSerializer
|
||||||
from paperless.serialisers import UserSerializer
|
from paperless.serialisers import UserSerializer
|
||||||
from paperless.views import StandardPagination
|
from paperless.views import StandardPagination
|
||||||
@@ -1081,15 +1082,17 @@ class DocumentViewSet(
|
|||||||
if not Path(file).is_file():
|
if not Path(file).is_file():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
parser_class = get_parser_class_for_mime_type(mime_type)
|
parser_class = get_parser_registry().get_parser_for_file(
|
||||||
|
mime_type,
|
||||||
|
Path(file).name,
|
||||||
|
Path(file),
|
||||||
|
)
|
||||||
if parser_class:
|
if parser_class:
|
||||||
parser = parser_class(progress_callback=None, logging_group=None)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
with parser_class() as parser:
|
||||||
return parser.extract_metadata(file, mime_type)
|
return parser.extract_metadata(file, mime_type)
|
||||||
except Exception: # pragma: no cover
|
except Exception: # pragma: no cover
|
||||||
logger.exception(f"Issue getting metadata for {file}")
|
logger.exception(f"Issue getting metadata for {file}")
|
||||||
# TODO: cover GPG errors, remove later.
|
|
||||||
return []
|
return []
|
||||||
else: # pragma: no cover
|
else: # pragma: no cover
|
||||||
logger.warning(f"No parser for {mime_type}")
|
logger.warning(f"No parser for {mime_type}")
|
||||||
@@ -1328,6 +1331,7 @@ class DocumentViewSet(
|
|||||||
methods=["get", "post", "delete"],
|
methods=["get", "post", "delete"],
|
||||||
detail=True,
|
detail=True,
|
||||||
permission_classes=[PaperlessNotePermissions],
|
permission_classes=[PaperlessNotePermissions],
|
||||||
|
pagination_class=None,
|
||||||
filter_backends=[],
|
filter_backends=[],
|
||||||
)
|
)
|
||||||
def notes(self, request, pk=None):
|
def notes(self, request, pk=None):
|
||||||
@@ -1965,11 +1969,28 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
|||||||
filtered_queryset = super().filter_queryset(queryset)
|
filtered_queryset = super().filter_queryset(queryset)
|
||||||
|
|
||||||
if self._is_search_request():
|
if self._is_search_request():
|
||||||
|
if "query" in self.request.query_params:
|
||||||
from documents import index
|
from documents import index
|
||||||
|
|
||||||
if "query" in self.request.query_params:
|
|
||||||
query_class = index.DelayedFullTextQuery
|
query_class = index.DelayedFullTextQuery
|
||||||
elif "more_like_id" in self.request.query_params:
|
elif "more_like_id" in self.request.query_params:
|
||||||
|
try:
|
||||||
|
more_like_doc_id = int(self.request.query_params["more_like_id"])
|
||||||
|
more_like_doc = Document.objects.select_related("owner").get(
|
||||||
|
pk=more_like_doc_id,
|
||||||
|
)
|
||||||
|
except (TypeError, ValueError, Document.DoesNotExist):
|
||||||
|
raise PermissionDenied(_("Invalid more_like_id"))
|
||||||
|
|
||||||
|
if not has_perms_owner_aware(
|
||||||
|
self.request.user,
|
||||||
|
"view_document",
|
||||||
|
more_like_doc,
|
||||||
|
):
|
||||||
|
raise PermissionDenied(_("Insufficient permissions."))
|
||||||
|
|
||||||
|
from documents import index
|
||||||
|
|
||||||
query_class = index.DelayedMoreLikeThisQuery
|
query_class = index.DelayedMoreLikeThisQuery
|
||||||
else:
|
else:
|
||||||
raise ValueError
|
raise ValueError
|
||||||
@@ -2005,6 +2026,8 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
|||||||
return response
|
return response
|
||||||
except NotFound:
|
except NotFound:
|
||||||
raise
|
raise
|
||||||
|
except PermissionDenied as e:
|
||||||
|
return HttpResponseForbidden(str(e.detail))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"An error occurred listing search results: {e!s}")
|
logger.warning(f"An error occurred listing search results: {e!s}")
|
||||||
return HttpResponseBadRequest(
|
return HttpResponseBadRequest(
|
||||||
@@ -2943,13 +2966,21 @@ class GlobalSearchView(PassUserMixin):
|
|||||||
)
|
)
|
||||||
groups = groups[:OBJECT_LIMIT]
|
groups = groups[:OBJECT_LIMIT]
|
||||||
mail_rules = (
|
mail_rules = (
|
||||||
MailRule.objects.filter(name__icontains=query)
|
get_objects_for_user_owner_aware(
|
||||||
|
request.user,
|
||||||
|
"view_mailrule",
|
||||||
|
MailRule,
|
||||||
|
).filter(name__icontains=query)
|
||||||
if request.user.has_perm("paperless_mail.view_mailrule")
|
if request.user.has_perm("paperless_mail.view_mailrule")
|
||||||
else []
|
else []
|
||||||
)
|
)
|
||||||
mail_rules = mail_rules[:OBJECT_LIMIT]
|
mail_rules = mail_rules[:OBJECT_LIMIT]
|
||||||
mail_accounts = (
|
mail_accounts = (
|
||||||
MailAccount.objects.filter(name__icontains=query)
|
get_objects_for_user_owner_aware(
|
||||||
|
request.user,
|
||||||
|
"view_mailaccount",
|
||||||
|
MailAccount,
|
||||||
|
).filter(name__icontains=query)
|
||||||
if request.user.has_perm("paperless_mail.view_mailaccount")
|
if request.user.has_perm("paperless_mail.view_mailaccount")
|
||||||
else []
|
else []
|
||||||
)
|
)
|
||||||
@@ -3923,7 +3954,7 @@ class CustomFieldViewSet(PermissionsAwareDocumentCountMixin, ModelViewSet):
|
|||||||
document_count_through = CustomFieldInstance
|
document_count_through = CustomFieldInstance
|
||||||
document_count_source_field = "field_id"
|
document_count_source_field = "field_id"
|
||||||
|
|
||||||
queryset = CustomField.objects.all().order_by("-created")
|
queryset = CustomField.objects.all().order_by("name")
|
||||||
|
|
||||||
|
|
||||||
@extend_schema_view(
|
@extend_schema_view(
|
||||||
|
|||||||
@@ -1,12 +1,14 @@
|
|||||||
import ipaddress
|
|
||||||
import logging
|
import logging
|
||||||
import socket
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from celery import shared_task
|
from celery import shared_task
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
|
from paperless.network import format_host_for_url
|
||||||
|
from paperless.network import is_public_ip
|
||||||
|
from paperless.network import resolve_hostname_ips
|
||||||
|
from paperless.network import validate_outbound_http_url
|
||||||
|
|
||||||
logger = logging.getLogger("paperless.workflows.webhooks")
|
logger = logging.getLogger("paperless.workflows.webhooks")
|
||||||
|
|
||||||
|
|
||||||
@@ -34,23 +36,19 @@ class WebhookTransport(httpx.HTTPTransport):
|
|||||||
raise httpx.ConnectError("No hostname in request URL")
|
raise httpx.ConnectError("No hostname in request URL")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
addr_info = socket.getaddrinfo(hostname, None)
|
ips = resolve_hostname_ips(hostname)
|
||||||
except socket.gaierror as e:
|
except ValueError as e:
|
||||||
raise httpx.ConnectError(f"Could not resolve hostname: {hostname}") from e
|
raise httpx.ConnectError(str(e)) from e
|
||||||
|
|
||||||
ips = [info[4][0] for info in addr_info if info and info[4]]
|
|
||||||
if not ips:
|
|
||||||
raise httpx.ConnectError(f"Could not resolve hostname: {hostname}")
|
|
||||||
|
|
||||||
if not self.allow_internal:
|
if not self.allow_internal:
|
||||||
for ip_str in ips:
|
for ip_str in ips:
|
||||||
if not WebhookTransport.is_public_ip(ip_str):
|
if not is_public_ip(ip_str):
|
||||||
raise httpx.ConnectError(
|
raise httpx.ConnectError(
|
||||||
f"Connection blocked: {hostname} resolves to a non-public address",
|
f"Connection blocked: {hostname} resolves to a non-public address",
|
||||||
)
|
)
|
||||||
|
|
||||||
ip_str = ips[0]
|
ip_str = ips[0]
|
||||||
formatted_ip = self._format_ip_for_url(ip_str)
|
formatted_ip = format_host_for_url(ip_str)
|
||||||
|
|
||||||
new_headers = httpx.Headers(request.headers)
|
new_headers = httpx.Headers(request.headers)
|
||||||
if "host" in new_headers:
|
if "host" in new_headers:
|
||||||
@@ -69,40 +67,6 @@ class WebhookTransport(httpx.HTTPTransport):
|
|||||||
|
|
||||||
return super().handle_request(request)
|
return super().handle_request(request)
|
||||||
|
|
||||||
def _format_ip_for_url(self, ip: str) -> str:
|
|
||||||
"""
|
|
||||||
Format IP address for use in URL (wrap IPv6 in brackets)
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
ip_obj = ipaddress.ip_address(ip)
|
|
||||||
if ip_obj.version == 6:
|
|
||||||
return f"[{ip}]"
|
|
||||||
return ip
|
|
||||||
except ValueError:
|
|
||||||
return ip
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def is_public_ip(ip: str | int) -> bool:
|
|
||||||
try:
|
|
||||||
obj = ipaddress.ip_address(ip)
|
|
||||||
return not (
|
|
||||||
obj.is_private
|
|
||||||
or obj.is_loopback
|
|
||||||
or obj.is_link_local
|
|
||||||
or obj.is_multicast
|
|
||||||
or obj.is_unspecified
|
|
||||||
)
|
|
||||||
except ValueError: # pragma: no cover
|
|
||||||
return False
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def resolve_first_ip(host: str) -> str | None:
|
|
||||||
try:
|
|
||||||
info = socket.getaddrinfo(host, None)
|
|
||||||
return info[0][4][0] if info else None
|
|
||||||
except Exception: # pragma: no cover
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
@shared_task(
|
@shared_task(
|
||||||
retry_backoff=True,
|
retry_backoff=True,
|
||||||
@@ -118,21 +82,24 @@ def send_webhook(
|
|||||||
*,
|
*,
|
||||||
as_json: bool = False,
|
as_json: bool = False,
|
||||||
):
|
):
|
||||||
p = urlparse(url)
|
try:
|
||||||
if p.scheme.lower() not in settings.WEBHOOKS_ALLOWED_SCHEMES or not p.hostname:
|
parsed = validate_outbound_http_url(
|
||||||
logger.warning("Webhook blocked: invalid scheme/hostname")
|
url,
|
||||||
|
allowed_schemes=settings.WEBHOOKS_ALLOWED_SCHEMES,
|
||||||
|
allowed_ports=settings.WEBHOOKS_ALLOWED_PORTS,
|
||||||
|
# Internal-address checks happen in transport to preserve ConnectError behavior.
|
||||||
|
allow_internal=True,
|
||||||
|
)
|
||||||
|
except ValueError as e:
|
||||||
|
logger.warning("Webhook blocked: %s", e)
|
||||||
|
raise
|
||||||
|
|
||||||
|
hostname = parsed.hostname
|
||||||
|
if hostname is None: # pragma: no cover
|
||||||
raise ValueError("Invalid URL scheme or hostname.")
|
raise ValueError("Invalid URL scheme or hostname.")
|
||||||
|
|
||||||
port = p.port or (443 if p.scheme == "https" else 80)
|
|
||||||
if (
|
|
||||||
len(settings.WEBHOOKS_ALLOWED_PORTS) > 0
|
|
||||||
and port not in settings.WEBHOOKS_ALLOWED_PORTS
|
|
||||||
):
|
|
||||||
logger.warning("Webhook blocked: port not permitted")
|
|
||||||
raise ValueError("Destination port not permitted.")
|
|
||||||
|
|
||||||
transport = WebhookTransport(
|
transport = WebhookTransport(
|
||||||
hostname=p.hostname,
|
hostname=hostname,
|
||||||
allow_internal=settings.WEBHOOKS_ALLOW_INTERNAL_REQUESTS,
|
allow_internal=settings.WEBHOOKS_ALLOW_INTERNAL_REQUESTS,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ msgid ""
|
|||||||
msgstr ""
|
msgstr ""
|
||||||
"Project-Id-Version: paperless-ngx\n"
|
"Project-Id-Version: paperless-ngx\n"
|
||||||
"Report-Msgid-Bugs-To: \n"
|
"Report-Msgid-Bugs-To: \n"
|
||||||
"POT-Creation-Date: 2026-03-12 15:43+0000\n"
|
"POT-Creation-Date: 2026-03-22 13:54+0000\n"
|
||||||
"PO-Revision-Date: 2022-02-17 04:17\n"
|
"PO-Revision-Date: 2022-02-17 04:17\n"
|
||||||
"Last-Translator: \n"
|
"Last-Translator: \n"
|
||||||
"Language-Team: English\n"
|
"Language-Team: English\n"
|
||||||
@@ -1299,7 +1299,9 @@ msgstr ""
|
|||||||
msgid "workflow runs"
|
msgid "workflow runs"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: documents/serialisers.py:463 documents/serialisers.py:2470
|
#: documents/serialisers.py:463 documents/serialisers.py:815
|
||||||
|
#: documents/serialisers.py:2501 documents/views.py:1990
|
||||||
|
#: paperless_mail/serialisers.py:143
|
||||||
msgid "Insufficient permissions."
|
msgid "Insufficient permissions."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
@@ -1307,39 +1309,39 @@ msgstr ""
|
|||||||
msgid "Invalid color."
|
msgid "Invalid color."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: documents/serialisers.py:2093
|
#: documents/serialisers.py:2124
|
||||||
#, python-format
|
#, python-format
|
||||||
msgid "File type %(type)s not supported"
|
msgid "File type %(type)s not supported"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: documents/serialisers.py:2137
|
#: documents/serialisers.py:2168
|
||||||
#, python-format
|
#, python-format
|
||||||
msgid "Custom field id must be an integer: %(id)s"
|
msgid "Custom field id must be an integer: %(id)s"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: documents/serialisers.py:2144
|
#: documents/serialisers.py:2175
|
||||||
#, python-format
|
#, python-format
|
||||||
msgid "Custom field with id %(id)s does not exist"
|
msgid "Custom field with id %(id)s does not exist"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: documents/serialisers.py:2161 documents/serialisers.py:2171
|
#: documents/serialisers.py:2192 documents/serialisers.py:2202
|
||||||
msgid ""
|
msgid ""
|
||||||
"Custom fields must be a list of integers or an object mapping ids to values."
|
"Custom fields must be a list of integers or an object mapping ids to values."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: documents/serialisers.py:2166
|
#: documents/serialisers.py:2197
|
||||||
msgid "Some custom fields don't exist or were specified twice."
|
msgid "Some custom fields don't exist or were specified twice."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: documents/serialisers.py:2313
|
#: documents/serialisers.py:2344
|
||||||
msgid "Invalid variable detected."
|
msgid "Invalid variable detected."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: documents/serialisers.py:2526
|
#: documents/serialisers.py:2557
|
||||||
msgid "Duplicate document identifiers are not allowed."
|
msgid "Duplicate document identifiers are not allowed."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: documents/serialisers.py:2556 documents/views.py:3565
|
#: documents/serialisers.py:2587 documents/views.py:3596
|
||||||
#, python-format
|
#, python-format
|
||||||
msgid "Documents not found: %(ids)s"
|
msgid "Documents not found: %(ids)s"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
@@ -1603,20 +1605,24 @@ msgstr ""
|
|||||||
msgid "Unable to parse URI {value}"
|
msgid "Unable to parse URI {value}"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: documents/views.py:3577
|
#: documents/views.py:1983
|
||||||
|
msgid "Invalid more_like_id"
|
||||||
|
msgstr ""
|
||||||
|
|
||||||
|
#: documents/views.py:3608
|
||||||
#, python-format
|
#, python-format
|
||||||
msgid "Insufficient permissions to share document %(id)s."
|
msgid "Insufficient permissions to share document %(id)s."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: documents/views.py:3620
|
#: documents/views.py:3651
|
||||||
msgid "Bundle is already being processed."
|
msgid "Bundle is already being processed."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: documents/views.py:3677
|
#: documents/views.py:3708
|
||||||
msgid "The share link bundle is still being prepared. Please try again later."
|
msgid "The share link bundle is still being prepared. Please try again later."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: documents/views.py:3687
|
#: documents/views.py:3718
|
||||||
msgid "The share link bundle is unavailable."
|
msgid "The share link bundle is unavailable."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
@@ -1856,151 +1862,151 @@ msgstr ""
|
|||||||
msgid "paperless application settings"
|
msgid "paperless application settings"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:521
|
#: paperless/settings/__init__.py:518
|
||||||
msgid "English (US)"
|
msgid "English (US)"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:522
|
#: paperless/settings/__init__.py:519
|
||||||
msgid "Arabic"
|
msgid "Arabic"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:523
|
#: paperless/settings/__init__.py:520
|
||||||
msgid "Afrikaans"
|
msgid "Afrikaans"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:524
|
#: paperless/settings/__init__.py:521
|
||||||
msgid "Belarusian"
|
msgid "Belarusian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:525
|
#: paperless/settings/__init__.py:522
|
||||||
msgid "Bulgarian"
|
msgid "Bulgarian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:526
|
#: paperless/settings/__init__.py:523
|
||||||
msgid "Catalan"
|
msgid "Catalan"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:527
|
#: paperless/settings/__init__.py:524
|
||||||
msgid "Czech"
|
msgid "Czech"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:528
|
#: paperless/settings/__init__.py:525
|
||||||
msgid "Danish"
|
msgid "Danish"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:529
|
#: paperless/settings/__init__.py:526
|
||||||
msgid "German"
|
msgid "German"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:530
|
#: paperless/settings/__init__.py:527
|
||||||
msgid "Greek"
|
msgid "Greek"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:531
|
#: paperless/settings/__init__.py:528
|
||||||
msgid "English (GB)"
|
msgid "English (GB)"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:532
|
#: paperless/settings/__init__.py:529
|
||||||
msgid "Spanish"
|
msgid "Spanish"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:533
|
#: paperless/settings/__init__.py:530
|
||||||
msgid "Persian"
|
msgid "Persian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:534
|
#: paperless/settings/__init__.py:531
|
||||||
msgid "Finnish"
|
msgid "Finnish"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:535
|
#: paperless/settings/__init__.py:532
|
||||||
msgid "French"
|
msgid "French"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:536
|
#: paperless/settings/__init__.py:533
|
||||||
msgid "Hungarian"
|
msgid "Hungarian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:537
|
#: paperless/settings/__init__.py:534
|
||||||
msgid "Indonesian"
|
msgid "Indonesian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:538
|
#: paperless/settings/__init__.py:535
|
||||||
msgid "Italian"
|
msgid "Italian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:539
|
#: paperless/settings/__init__.py:536
|
||||||
msgid "Japanese"
|
msgid "Japanese"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:540
|
#: paperless/settings/__init__.py:537
|
||||||
msgid "Korean"
|
msgid "Korean"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:541
|
#: paperless/settings/__init__.py:538
|
||||||
msgid "Luxembourgish"
|
msgid "Luxembourgish"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:542
|
#: paperless/settings/__init__.py:539
|
||||||
msgid "Norwegian"
|
msgid "Norwegian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:543
|
#: paperless/settings/__init__.py:540
|
||||||
msgid "Dutch"
|
msgid "Dutch"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:544
|
#: paperless/settings/__init__.py:541
|
||||||
msgid "Polish"
|
msgid "Polish"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:545
|
#: paperless/settings/__init__.py:542
|
||||||
msgid "Portuguese (Brazil)"
|
msgid "Portuguese (Brazil)"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:546
|
#: paperless/settings/__init__.py:543
|
||||||
msgid "Portuguese"
|
msgid "Portuguese"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:547
|
#: paperless/settings/__init__.py:544
|
||||||
msgid "Romanian"
|
msgid "Romanian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:548
|
#: paperless/settings/__init__.py:545
|
||||||
msgid "Russian"
|
msgid "Russian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:549
|
#: paperless/settings/__init__.py:546
|
||||||
msgid "Slovak"
|
msgid "Slovak"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:550
|
#: paperless/settings/__init__.py:547
|
||||||
msgid "Slovenian"
|
msgid "Slovenian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:551
|
#: paperless/settings/__init__.py:548
|
||||||
msgid "Serbian"
|
msgid "Serbian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:552
|
#: paperless/settings/__init__.py:549
|
||||||
msgid "Swedish"
|
msgid "Swedish"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:553
|
#: paperless/settings/__init__.py:550
|
||||||
msgid "Turkish"
|
msgid "Turkish"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:554
|
#: paperless/settings/__init__.py:551
|
||||||
msgid "Ukrainian"
|
msgid "Ukrainian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:555
|
#: paperless/settings/__init__.py:552
|
||||||
msgid "Vietnamese"
|
msgid "Vietnamese"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:556
|
#: paperless/settings/__init__.py:553
|
||||||
msgid "Chinese Simplified"
|
msgid "Chinese Simplified"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings/__init__.py:557
|
#: paperless/settings/__init__.py:554
|
||||||
msgid "Chinese Traditional"
|
msgid "Chinese Traditional"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
@@ -2046,7 +2052,7 @@ msgid ""
|
|||||||
"process all matching rules that you have defined."
|
"process all matching rules that you have defined."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless_mail/apps.py:11
|
#: paperless_mail/apps.py:8
|
||||||
msgid "Paperless mail"
|
msgid "Paperless mail"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
|
|||||||
@@ -83,3 +83,11 @@ class PaperlessBasicAuthentication(authentication.BasicAuthentication):
|
|||||||
raise exceptions.AuthenticationFailed("MFA required")
|
raise exceptions.AuthenticationFailed("MFA required")
|
||||||
|
|
||||||
return user_tuple
|
return user_tuple
|
||||||
|
|
||||||
|
def authenticate_header(self, request):
|
||||||
|
auth_header = request.META.get("HTTP_AUTHORIZATION", "")
|
||||||
|
if auth_header.lower().startswith("basic "):
|
||||||
|
return super().authenticate_header(request)
|
||||||
|
|
||||||
|
# Still 401 for anonymous API access
|
||||||
|
return authentication.TokenAuthentication.keyword
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import os
|
|||||||
import pwd
|
import pwd
|
||||||
import shutil
|
import shutil
|
||||||
import stat
|
import stat
|
||||||
|
import subprocess
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
@@ -299,3 +300,62 @@ def check_deprecated_db_settings(
|
|||||||
)
|
)
|
||||||
|
|
||||||
return warnings
|
return warnings
|
||||||
|
|
||||||
|
|
||||||
|
@register()
def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
    """System check for the Azure AI remote OCR engine.

    When ``PAPERLESS_REMOTE_OCR_ENGINE`` is set to ``azureai``, both an
    endpoint and an API key must be configured; otherwise an Error is
    reported so startup fails loudly instead of at first parse.
    """
    if settings.REMOTE_OCR_ENGINE != "azureai":
        return []
    if settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY:
        return []
    return [
        Error(
            "Azure AI remote parser requires endpoint and API key to be configured.",
        ),
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def get_tesseract_langs():
    """Return the language codes the local tesseract binary reports.

    Runs ``tesseract --list-langs``, decodes its output and returns the
    trimmed language codes, skipping the one-line header tesseract
    prints before the list.
    """
    completed = subprocess.run(
        [shutil.which("tesseract"), "--list-langs"],
        capture_output=True,
    )

    # Decode, drop the "List of available languages" header line.
    raw_lines = completed.stdout.decode("utf8", errors="ignore").strip().split("\n")

    return [line.strip() for line in raw_lines[1:]]
|
||||||
|
|
||||||
|
|
||||||
|
@register()
def check_default_language_available(app_configs, **kwargs):
    """System check for PAPERLESS_OCR_LANGUAGE.

    Warns when no OCR language is configured (tesseract then falls back
    to English) and errors when a configured language is not installed.
    """
    errs = []

    if not settings.OCR_LANGUAGE:
        errs.append(
            Warning(
                "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
                "This means that tesseract will fallback to english.",
            ),
        )
        # Nothing further to validate when no language is configured.
        return errs

    # binaries_check in paperless will check and report if this doesn't exist
    # So skip trying to do anything here and let that handle missing binaries
    if shutil.which("tesseract") is not None:
        installed_langs = get_tesseract_langs()

        # Languages may be combined with "+" (e.g. "eng+deu"); check each part.
        specified_langs = [x.strip() for x in settings.OCR_LANGUAGE.split("+")]

        for lang in specified_langs:
            if lang not in installed_langs:
                errs.append(
                    Error(
                        f"The selected ocr language {lang} is "
                        f"not installed. Paperless cannot OCR your documents "
                        f"without it. Please fix PAPERLESS_OCR_LANGUAGE.",
                    ),
                )

    return errs
|
||||||
|
|||||||
@@ -188,6 +188,7 @@ class AIConfig(BaseConfig):
|
|||||||
llm_model: str = dataclasses.field(init=False)
|
llm_model: str = dataclasses.field(init=False)
|
||||||
llm_api_key: str = dataclasses.field(init=False)
|
llm_api_key: str = dataclasses.field(init=False)
|
||||||
llm_endpoint: str = dataclasses.field(init=False)
|
llm_endpoint: str = dataclasses.field(init=False)
|
||||||
|
llm_allow_internal_endpoints: bool = dataclasses.field(init=False)
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
def __post_init__(self) -> None:
|
||||||
app_config = self._get_config_instance()
|
app_config = self._get_config_instance()
|
||||||
@@ -203,6 +204,7 @@ class AIConfig(BaseConfig):
|
|||||||
self.llm_model = app_config.llm_model or settings.LLM_MODEL
|
self.llm_model = app_config.llm_model or settings.LLM_MODEL
|
||||||
self.llm_api_key = app_config.llm_api_key or settings.LLM_API_KEY
|
self.llm_api_key = app_config.llm_api_key or settings.LLM_API_KEY
|
||||||
self.llm_endpoint = app_config.llm_endpoint or settings.LLM_ENDPOINT
|
self.llm_endpoint = app_config.llm_endpoint or settings.LLM_ENDPOINT
|
||||||
|
self.llm_allow_internal_endpoints = settings.LLM_ALLOW_INTERNAL_ENDPOINTS
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def llm_index_enabled(self) -> bool:
|
def llm_index_enabled(self) -> bool:
|
||||||
|
|||||||
76
src/paperless/network.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
import ipaddress
|
||||||
|
import socket
|
||||||
|
from collections.abc import Collection
|
||||||
|
from urllib.parse import ParseResult
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
|
||||||
|
def is_public_ip(ip: str | int) -> bool:
|
||||||
|
try:
|
||||||
|
obj = ipaddress.ip_address(ip)
|
||||||
|
return not (
|
||||||
|
obj.is_private
|
||||||
|
or obj.is_loopback
|
||||||
|
or obj.is_link_local
|
||||||
|
or obj.is_multicast
|
||||||
|
or obj.is_unspecified
|
||||||
|
)
|
||||||
|
except ValueError: # pragma: no cover
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_hostname_ips(hostname: str) -> list[str]:
    """Resolve *hostname* to every IP address it maps to.

    Raises
    ------
    ValueError
        When DNS resolution fails or produces no usable addresses.
    """
    try:
        records = socket.getaddrinfo(hostname, None)
    except socket.gaierror as exc:
        raise ValueError(f"Could not resolve hostname: {hostname}") from exc

    resolved = [record[4][0] for record in records if record and record[4]]
    if not resolved:
        raise ValueError(f"Could not resolve hostname: {hostname}")
    return resolved
|
||||||
|
|
||||||
|
|
||||||
|
def format_host_for_url(host: str) -> str:
    """Return *host* ready for embedding in a URL.

    IPv6 literals get wrapped in square brackets; IPv4 literals,
    hostnames and unparseable strings come back unchanged.
    """
    try:
        version = ipaddress.ip_address(host).version
    except ValueError:
        # Not an IP literal (likely a hostname) — no bracketing needed.
        return host
    return f"[{host}]" if version == 6 else host
|
||||||
|
|
||||||
|
|
||||||
|
def validate_outbound_http_url(
|
||||||
|
url: str,
|
||||||
|
*,
|
||||||
|
allowed_schemes: Collection[str] = ("http", "https"),
|
||||||
|
allowed_ports: Collection[int] | None = None,
|
||||||
|
allow_internal: bool = False,
|
||||||
|
) -> ParseResult:
|
||||||
|
parsed = urlparse(url)
|
||||||
|
scheme = parsed.scheme.lower()
|
||||||
|
|
||||||
|
if scheme not in allowed_schemes or not parsed.hostname:
|
||||||
|
raise ValueError("Invalid URL scheme or hostname.")
|
||||||
|
|
||||||
|
default_port = 443 if scheme == "https" else 80
|
||||||
|
try:
|
||||||
|
port = parsed.port or default_port
|
||||||
|
except ValueError as e:
|
||||||
|
raise ValueError("Invalid URL scheme or hostname.") from e
|
||||||
|
|
||||||
|
if allowed_ports and port not in allowed_ports:
|
||||||
|
raise ValueError("Destination port not permitted.")
|
||||||
|
|
||||||
|
if not allow_internal:
|
||||||
|
for ip_str in resolve_hostname_ips(parsed.hostname):
|
||||||
|
if not is_public_ip(ip_str):
|
||||||
|
raise ValueError(
|
||||||
|
f"Connection blocked: {parsed.hostname} resolves to a non-public address",
|
||||||
|
)
|
||||||
|
|
||||||
|
return parsed
|
||||||
@@ -35,6 +35,7 @@ Usage example (third-party parser)::
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
from typing import Protocol
|
from typing import Protocol
|
||||||
from typing import Self
|
from typing import Self
|
||||||
@@ -48,6 +49,7 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"MetadataEntry",
|
"MetadataEntry",
|
||||||
|
"ParserContext",
|
||||||
"ParserProtocol",
|
"ParserProtocol",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -73,6 +75,44 @@ class MetadataEntry(TypedDict):
|
|||||||
"""String representation of the field value."""
|
"""String representation of the field value."""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True, slots=True)
|
||||||
|
class ParserContext:
|
||||||
|
"""Immutable context passed to a parser before parse().
|
||||||
|
|
||||||
|
The consumer assembles this from the ingestion event and Django
|
||||||
|
settings, then calls ``parser.configure(context)`` before
|
||||||
|
``parser.parse()``. Parsers read only the fields relevant to them;
|
||||||
|
unneeded fields are ignored.
|
||||||
|
|
||||||
|
``frozen=True`` prevents accidental mutation after the consumer
|
||||||
|
hands the context off. ``slots=True`` keeps instances lightweight.
|
||||||
|
|
||||||
|
Fields
|
||||||
|
------
|
||||||
|
mailrule_id : int | None
|
||||||
|
Primary key of the ``MailRule`` that triggered this ingestion,
|
||||||
|
or ``None`` when the document did not arrive via a mail rule.
|
||||||
|
Used by ``MailDocumentParser`` to select the PDF layout.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
Future fields (not yet implemented):
|
||||||
|
|
||||||
|
* ``output_type`` — PDF/A variant for archive generation
|
||||||
|
(replaces ``settings.OCR_OUTPUT_TYPE`` reads inside parsers).
|
||||||
|
* ``ocr_mode`` — skip-text, redo, force, etc.
|
||||||
|
(replaces ``settings.OCR_MODE`` reads inside parsers).
|
||||||
|
* ``ocr_language`` — Tesseract language string.
|
||||||
|
(replaces ``settings.OCR_LANGUAGE`` reads inside parsers).
|
||||||
|
|
||||||
|
When those fields are added the consumer will read from Django
|
||||||
|
settings once and populate them here, decoupling parsers from
|
||||||
|
``settings.*`` entirely.
|
||||||
|
"""
|
||||||
|
|
||||||
|
mailrule_id: int | None = None
|
||||||
|
|
||||||
|
|
||||||
@runtime_checkable
|
@runtime_checkable
|
||||||
class ParserProtocol(Protocol):
|
class ParserProtocol(Protocol):
|
||||||
"""Structural contract for all Paperless-ngx document parsers.
|
"""Structural contract for all Paperless-ngx document parsers.
|
||||||
@@ -191,6 +231,21 @@ class ParserProtocol(Protocol):
|
|||||||
# Core parsing interface
|
# Core parsing interface
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def configure(self, context: ParserContext) -> None:
    """Apply source context before parse().

    Called by the consumer after instantiation and before parse().
    The default implementation is a no-op; parsers override only the
    fields they need. Implementations must not mutate *context* —
    ``ParserContext`` is a frozen dataclass.

    Parameters
    ----------
    context:
        Immutable context assembled by the consumer for this
        specific ingestion event.
    """
    ...
|
||||||
|
|
||||||
def parse(
|
def parse(
|
||||||
self,
|
self,
|
||||||
document_path: Path,
|
document_path: Path,
|
||||||
|
|||||||
834
src/paperless/parsers/mail.py
Normal file
@@ -0,0 +1,834 @@
|
|||||||
|
"""
|
||||||
|
Built-in mail document parser.
|
||||||
|
|
||||||
|
Handles message/rfc822 (EML) MIME type by:
|
||||||
|
- Parsing the email using imap_tools
|
||||||
|
- Generating a PDF via Gotenberg (for display and archive)
|
||||||
|
- Extracting text via Tika for HTML content
|
||||||
|
- Extracting metadata from email headers
|
||||||
|
|
||||||
|
The parser always produces a PDF because EML files cannot be rendered
|
||||||
|
natively in a browser (requires_pdf_rendition=True).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
from html import escape
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
from bleach import clean
|
||||||
|
from bleach import linkify
|
||||||
|
from django.conf import settings
|
||||||
|
from django.utils import timezone
|
||||||
|
from django.utils.timezone import is_naive
|
||||||
|
from django.utils.timezone import make_aware
|
||||||
|
from gotenberg_client import GotenbergClient
|
||||||
|
from gotenberg_client.constants import A4
|
||||||
|
from gotenberg_client.options import Measurement
|
||||||
|
from gotenberg_client.options import MeasurementUnitType
|
||||||
|
from gotenberg_client.options import PageMarginsType
|
||||||
|
from gotenberg_client.options import PdfAFormat
|
||||||
|
from humanize import naturalsize
|
||||||
|
from imap_tools import MailAttachment
|
||||||
|
from imap_tools import MailMessage
|
||||||
|
from tika_client import TikaClient
|
||||||
|
|
||||||
|
from documents.parsers import ParseError
|
||||||
|
from documents.parsers import make_thumbnail_from_pdf
|
||||||
|
from paperless.models import OutputTypeChoices
|
||||||
|
from paperless.version import __full_version_str__
|
||||||
|
from paperless_mail.models import MailRule
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import datetime
|
||||||
|
from types import TracebackType
|
||||||
|
|
||||||
|
from paperless.parsers import MetadataEntry
|
||||||
|
from paperless.parsers import ParserContext
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.parsing.mail")
|
||||||
|
|
||||||
|
# MIME types handled by MailDocumentParser, mapped to the preferred
# file extension for each.
_SUPPORTED_MIME_TYPES: dict[str, str] = {
    "message/rfc822": ".eml",
}
|
||||||
|
|
||||||
|
|
||||||
|
class MailDocumentParser:
|
||||||
|
"""Parse .eml email files for Paperless-ngx.
|
||||||
|
|
||||||
|
Uses imap_tools to parse .eml files, generates a PDF using Gotenberg,
|
||||||
|
and sends the HTML part to a Tika server for text extraction. Because
|
||||||
|
EML files cannot be rendered natively in a browser, the parser always
|
||||||
|
produces a PDF rendition (requires_pdf_rendition=True).
|
||||||
|
|
||||||
|
Pass a ``ParserContext`` to ``configure()`` before ``parse()`` to
|
||||||
|
apply mail-rule-specific PDF layout options:
|
||||||
|
|
||||||
|
parser.configure(ParserContext(mailrule_id=rule.pk))
|
||||||
|
parser.parse(path, mime_type)
|
||||||
|
|
||||||
|
Class attributes
|
||||||
|
----------------
|
||||||
|
name : str
|
||||||
|
Human-readable parser name.
|
||||||
|
version : str
|
||||||
|
Semantic version string, kept in sync with Paperless-ngx releases.
|
||||||
|
author : str
|
||||||
|
Maintainer name.
|
||||||
|
url : str
|
||||||
|
Issue tracker / source URL.
|
||||||
|
"""
|
||||||
|
|
||||||
|
name: str = "Paperless-ngx Mail Parser"
|
||||||
|
version: str = __full_version_str__
|
||||||
|
author: str = "Paperless-ngx Contributors"
|
||||||
|
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Class methods
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
@classmethod
def supported_mime_types(cls) -> dict[str, str]:
    """Map each MIME type this parser accepts to its preferred extension.

    Returns
    -------
    dict[str, str]
        Mapping of MIME type to preferred file extension.
    """
    return _SUPPORTED_MIME_TYPES
|
||||||
|
|
||||||
|
@classmethod
def score(
    cls,
    mime_type: str,
    filename: str,
    path: Path | None = None,
) -> int | None:
    """Return the priority score for handling this file.

    *filename* and *path* are part of the protocol signature but are
    not inspected by this parser — only the MIME type matters.

    Parameters
    ----------
    mime_type:
        Detected MIME type of the file.
    filename:
        Original filename including extension.
    path:
        Optional filesystem path. Not inspected by this parser.

    Returns
    -------
    int | None
        10 if the MIME type is supported, otherwise None.
    """
    return 10 if mime_type in _SUPPORTED_MIME_TYPES else None
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Properties
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
@property
def can_produce_archive(self) -> bool:
    """Whether this parser can produce a searchable PDF archive copy.

    Always False: the PDF produced here is the display rendition
    (see ``requires_pdf_rendition``), not an optional OCR archive.
    """
    return False
|
||||||
|
|
||||||
|
@property
def requires_pdf_rendition(self) -> bool:
    """Whether the parser must produce a PDF for the frontend to display.

    Always True: EML files cannot be rendered natively in a browser,
    so a PDF conversion is always required for display.
    """
    return True
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Lifecycle
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def __init__(self, logging_group: object = None) -> None:
    """Create the scratch workspace and reset all parse results."""
    settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
    scratch = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
    self._tempdir = Path(scratch)
    # Populated by parse(); None until then.
    self._text: str | None = None
    self._date: datetime.datetime | None = None
    self._archive_path: Path | None = None
    # Set via configure(); selects the mail rule's PDF layout.
    self._mailrule_id: int | None = None
|
||||||
|
def __enter__(self) -> Self:
    """Enter context management; the parser itself is the resource."""
    return self
|
||||||
|
|
||||||
|
def __exit__(
    self,
    exc_type: type[BaseException] | None,
    exc_val: BaseException | None,
    exc_tb: TracebackType | None,
) -> None:
    """Remove the scratch directory; cleanup errors are ignored."""
    logger.debug("Cleaning up temporary directory %s", self._tempdir)
    shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Core parsing interface
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def configure(self, context: ParserContext) -> None:
    """Record the mail rule id so parse() can pick the PDF layout."""
    self._mailrule_id = context.mailrule_id
|
||||||
|
|
||||||
|
def parse(
    self,
    document_path: Path,
    mime_type: str,
    *,
    produce_archive: bool = True,
) -> None:
    """Parse the given .eml into formatted text and a PDF archive.

    Call ``configure(ParserContext(mailrule_id=...))`` before this method
    to apply mail-rule-specific PDF layout options. The ``produce_archive``
    flag is accepted for protocol compatibility but is always honoured —
    the mail parser always produces a PDF since EML files cannot be
    displayed natively.

    Parameters
    ----------
    document_path:
        Absolute path to the .eml file.
    mime_type:
        Detected MIME type of the document (should be "message/rfc822").
    produce_archive:
        Accepted for protocol compatibility. The PDF rendition is always
        produced since EML files cannot be displayed natively in a browser.

    Raises
    ------
    documents.parsers.ParseError
        If the file cannot be parsed or PDF generation fails.
    """

    def strip_text(text: str) -> str:
        """Reduces the spacing of the given text string."""
        text = re.sub(r"\s+", " ", text)
        text = re.sub(r"(\n *)+", "\n", text)
        return text.strip()

    def build_formatted_text(mail_message: MailMessage) -> str:
        """Constructs a formatted string based on the given email."""
        # FIX: previously this closure mixed its parameter with the
        # outer ``mail`` variable (mail.cc_values, mail.attachments,
        # mail.html, ...), only working because it was called after
        # ``mail`` was bound. It now reads the parameter exclusively.
        fmt_text = f"Subject: {mail_message.subject}\n\n"
        fmt_text += f"From: {mail_message.from_values.full if mail_message.from_values else ''}\n\n"
        to_list = [address.full for address in mail_message.to_values]
        fmt_text += f"To: {', '.join(to_list)}\n\n"
        if mail_message.cc_values:
            fmt_text += (
                f"CC: {', '.join(address.full for address in mail_message.cc_values)}\n\n"
            )
        if mail_message.bcc_values:
            fmt_text += (
                f"BCC: {', '.join(address.full for address in mail_message.bcc_values)}\n\n"
            )
        if mail_message.attachments:
            att = []
            for a in mail_message.attachments:
                attachment_size = naturalsize(a.size, binary=True, format="%.2f")
                att.append(
                    f"{a.filename} ({attachment_size})",
                )
            fmt_text += f"Attachments: {', '.join(att)}\n\n"

        if mail_message.html:
            fmt_text += "HTML content: " + strip_text(
                self.tika_parse(mail_message.html),
            )

        fmt_text += f"\n\n{strip_text(mail_message.text)}"

        return fmt_text

    logger.debug("Parsing file %s into an email", document_path.name)
    mail = self.parse_file_to_message(document_path)

    logger.debug("Building formatted text from email")
    self._text = build_formatted_text(mail)

    # Normalise the header date to a timezone-aware datetime.
    if is_naive(mail.date):
        self._date = make_aware(mail.date)
    else:
        self._date = mail.date

    logger.debug("Creating a PDF from the email")
    if self._mailrule_id:
        # A mail rule supplies its own PDF layout preference.
        rule = MailRule.objects.get(pk=self._mailrule_id)
        self._archive_path = self.generate_pdf(
            mail,
            MailRule.PdfLayout(rule.pdf_layout),
        )
    else:
        self._archive_path = self.generate_pdf(mail)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Result accessors
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_text(self) -> str | None:
    """Return the plain-text content extracted by parse().

    Returns
    -------
    str | None
        Extracted text, or None when parse() has not run yet.
    """
    return self._text
|
||||||
|
|
||||||
|
def get_date(self) -> datetime.datetime | None:
    """Return the document date taken from the email headers.

    Returns
    -------
    datetime.datetime | None
        Header date, or None when parse() has not run yet.
    """
    return self._date
|
||||||
|
|
||||||
|
def get_archive_path(self) -> Path | None:
    """Return the path to the Gotenberg-generated archive PDF.

    Returns
    -------
    Path | None
        Path to the PDF, or None when parse() has not run yet.
    """
    return self._archive_path
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Thumbnail and metadata
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_thumbnail(
    self,
    document_path: Path,
    mime_type: str,
    file_name: str | None = None,
) -> Path:
    """Render a thumbnail for the email via its PDF rendition.

    The email is first converted to PDF (if parse() has not already done
    so), then the thumbnail is produced from that PDF.

    Parameters
    ----------
    document_path:
        Absolute path to the source .eml file.
    mime_type:
        Detected MIME type of the document.
    file_name:
        Unused; retained for backward compatibility with older callers.

    Returns
    -------
    Path
        Path to the generated WebP thumbnail inside the temporary directory.
    """
    # Lazily build the PDF rendition when parse() was never called.
    if not self._archive_path:
        message = self.parse_file_to_message(document_path)
        self._archive_path = self.generate_pdf(message)

    return make_thumbnail_from_pdf(self._archive_path, self._tempdir)
|
||||||
|
|
||||||
|
def get_page_count(
    self,
    document_path: Path,
    mime_type: str,
) -> int | None:
    """Count the pages of the archive PDF produced by parse().

    Returns
    -------
    int | None
        Page count of the archive PDF, or ``None`` when parse() has not
        run or produced no archive.
    """
    # Nothing to count before parse() has produced an archive PDF.
    if self._archive_path is None:
        return None

    from paperless.parsers.utils import get_page_count_for_pdf

    return get_page_count_for_pdf(self._archive_path, log=logger)
|
||||||
|
|
||||||
|
def extract_metadata(
    self,
    document_path: Path,
    mime_type: str,
) -> list[MetadataEntry]:
    """Extract metadata from the email headers.

    Emits one entry per email header (prefix "header"), plus summary
    entries describing the attachments and the message date.

    Returns
    -------
    list[MetadataEntry]
        Sorted list of metadata entries, or ``[]`` on parse failure.
    """
    entries: list[MetadataEntry] = []

    try:
        message = self.parse_file_to_message(document_path)
    except ParseError as exc:
        logger.warning(
            "Error while fetching document metadata for %s: %s",
            document_path,
            exc,
        )
        return entries

    for header_name, header_values in message.headers.items():
        joined = ", ".join(header_values)
        # Headers that cannot be encoded as UTF-8 are skipped rather
        # than crashing metadata extraction.
        try:
            joined.encode("utf-8")
        except UnicodeEncodeError as exc:  # pragma: no cover
            logger.debug("Skipping header %s: %s", header_name, exc)
            continue

        entries.append(
            {
                "namespace": "",
                "prefix": "header",
                "key": header_name,
                "value": joined,
            },
        )

    attachment_summary = ", ".join(
        f"{attachment.filename}"
        f"({naturalsize(attachment.size, binary=True, format='%.2f')})"
        for attachment in message.attachments
    )
    entries.append(
        {
            "namespace": "",
            "prefix": "",
            "key": "attachments",
            "value": attachment_summary,
        },
    )

    entries.append(
        {
            "namespace": "",
            "prefix": "",
            "key": "date",
            "value": message.date.strftime("%Y-%m-%d %H:%M:%S %Z"),
        },
    )

    entries.sort(key=lambda entry: (entry["prefix"], entry["key"]))
    return entries
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Email-specific methods
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
    """Map the configured OCR output type onto a Gotenberg PdfAFormat.

    Returns ``None`` when the configured output type needs no PDF/A
    post-processing.
    """
    output_type = settings.OCR_OUTPUT_TYPE

    if output_type in {OutputTypeChoices.PDF_A, OutputTypeChoices.PDF_A2}:
        return PdfAFormat.A2b
    if output_type == OutputTypeChoices.PDF_A1:  # pragma: no cover
        # Gotenberg has no PDF/A-1 support, so fall back to A-2b.
        logger.warning(
            "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
        )
        return PdfAFormat.A2b
    if output_type == OutputTypeChoices.PDF_A3:  # pragma: no cover
        return PdfAFormat.A3b
    return None
|
||||||
|
|
||||||
|
@staticmethod
def parse_file_to_message(filepath: Path) -> MailMessage:
    """Parse the given .eml file into a MailMessage object.

    Parameters
    ----------
    filepath:
        Path to the .eml file.

    Returns
    -------
    MailMessage
        Parsed mail message.

    Raises
    ------
    documents.parsers.ParseError
        If the file cannot be parsed or is missing required fields.
    """
    # Keep only the code that can raise unexpected errors inside the
    # try block. Previously the "Missing 'from'" ParseError was raised
    # inside it and then caught by the blanket `except Exception`,
    # which re-wrapped it into a doubled
    # "Could not parse X: Could not parse X: Missing 'from'" message.
    try:
        with filepath.open("rb") as eml:
            parsed = MailMessage.from_bytes(eml.read())
    except Exception as err:
        raise ParseError(
            f"Could not parse {filepath}: {err}",
        ) from err

    # Validate outside the try so the error message is emitted once.
    if parsed.from_values is None:
        raise ParseError(
            f"Could not parse {filepath}: Missing 'from'",
        )

    return parsed
|
||||||
|
|
||||||
|
def tika_parse(self, html: str) -> str:
    """Extract plain text from HTML via the configured Tika server.

    Parameters
    ----------
    html:
        HTML string to parse.

    Returns
    -------
    str
        Extracted plain text ("" when Tika returns no content).

    Raises
    ------
    documents.parsers.ParseError
        If the Tika server cannot be reached or returns an error.
    """
    logger.info("Sending content to Tika server")

    try:
        with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
            parsed = client.tika.as_text.from_buffer(html, "text/html")
            content = parsed.content
            return content.strip() if content is not None else ""
    except Exception as err:
        # Any failure (connection, HTTP, parsing) is surfaced uniformly.
        raise ParseError(
            f"Could not parse content with tika server at "
            f"{settings.TIKA_ENDPOINT}: {err}",
        ) from err
|
||||||
|
|
||||||
|
def generate_pdf(
    self,
    mail_message: MailMessage,
    pdf_layout: MailRule.PdfLayout | None = None,
) -> Path:
    """Generate a PDF from the email message.

    Creates separate PDFs for the email body and HTML content, then
    merges them according to the requested layout.

    Parameters
    ----------
    mail_message:
        Parsed email message.
    pdf_layout:
        Layout option for the PDF. Falls back to the
        EMAIL_PARSE_DEFAULT_LAYOUT setting if not provided.

    Returns
    -------
    Path
        Path to the generated PDF inside the temporary directory.

    Raises
    ------
    documents.parsers.ParseError
        If Gotenberg fails while merging the PDFs.
    """
    archive_path = Path(self._tempdir) / "merged.pdf"

    # The body-only PDF is always needed, whatever the layout.
    mail_pdf_file = self.generate_pdf_from_mail(mail_message)

    if pdf_layout is None:
        pdf_layout = MailRule.PdfLayout(settings.EMAIL_PARSE_DEFAULT_LAYOUT)

    # If no HTML content, create the PDF from the message.
    # Otherwise, create 2 PDFs and merge them with Gotenberg.
    if not mail_message.html:
        archive_path.write_bytes(mail_pdf_file.read_bytes())
    else:
        pdf_of_html_content = self.generate_pdf_from_html(
            mail_message.html,
            mail_message.attachments,
        )

        logger.debug("Merging email text and HTML content into single PDF")

        with (
            GotenbergClient(
                host=settings.TIKA_GOTENBERG_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ) as client,
            client.merge.merge() as route,
        ):
            # Configure requested PDF/A formatting, if any
            pdf_a_format = self._settings_to_gotenberg_pdfa()
            if pdf_a_format is not None:
                route.pdf_format(pdf_a_format)

            # The order of the list passed to merge() determines page
            # order in the output; unknown layouts fall through to
            # text-then-HTML.
            match pdf_layout:
                case MailRule.PdfLayout.HTML_TEXT:
                    route.merge([pdf_of_html_content, mail_pdf_file])
                case MailRule.PdfLayout.HTML_ONLY:
                    route.merge([pdf_of_html_content])
                case MailRule.PdfLayout.TEXT_ONLY:
                    route.merge([mail_pdf_file])
                case MailRule.PdfLayout.TEXT_HTML | _:
                    route.merge([mail_pdf_file, pdf_of_html_content])

            try:
                response = route.run()
                archive_path.write_bytes(response.content)
            except Exception as err:
                raise ParseError(
                    f"Error while merging email HTML into PDF: {err}",
                ) from err

    return archive_path
|
||||||
|
|
||||||
|
def mail_to_html(self, mail: MailMessage) -> Path:
    """Convert the given email into an HTML file using a template.

    Header fields (subject, from, to, cc, bcc, attachments) are sanitized
    and rendered through the ``email_msg_template.html`` template; a
    ``*_label`` key is only added when the corresponding field is
    non-empty, so the template can omit empty rows.

    Parameters
    ----------
    mail:
        Parsed mail message.

    Returns
    -------
    Path
        Path to the rendered HTML file inside the temporary directory.
    """

    def clean_html(text: str) -> str:
        """Attempt to clean, escape, and linkify the given HTML string."""
        # Some header accessors can yield lists; flatten them first.
        if isinstance(text, list):
            text = "\n".join([str(e) for e in text])
        if not isinstance(text, str):
            text = str(text)
        text = escape(text)
        text = clean(text)
        text = linkify(text, parse_email=True)
        text = text.replace("\n", "<br>")
        return text

    data = {}

    data["subject"] = clean_html(mail.subject)
    if data["subject"]:
        data["subject_label"] = "Subject"
    data["from"] = clean_html(mail.from_values.full if mail.from_values else "")
    if data["from"]:
        data["from_label"] = "From"
    data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
    if data["to"]:
        data["to_label"] = "To"
    data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
    if data["cc"]:
        data["cc_label"] = "CC"
    data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
    if data["bcc"]:
        data["bcc_label"] = "BCC"

    # Human-readable attachment list, e.g. "report.pdf (1.23 KiB)".
    att = []
    for a in mail.attachments:
        att.append(
            f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
        )
    data["attachments"] = clean_html(", ".join(att))
    if data["attachments"]:
        data["attachments_label"] = "Attachments"

    data["date"] = clean_html(
        timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
    )
    data["content"] = clean_html(mail.text.strip())

    # NOTE(review): imported locally, presumably to defer Django template
    # machinery until it is actually needed — confirm before hoisting.
    from django.template.loader import render_to_string

    html_file = Path(self._tempdir) / "email_as_html.html"
    html_file.write_text(render_to_string("email_msg_template.html", context=data))

    return html_file
|
||||||
|
|
||||||
|
def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
    """Create a PDF from the email body using an HTML template and Gotenberg.

    Parameters
    ----------
    mail:
        Parsed mail message.

    Returns
    -------
    Path
        Path to the generated PDF inside the temporary directory.

    Raises
    ------
    documents.parsers.ParseError
        If Gotenberg returns an error.
    """
    logger.info("Converting mail to PDF")

    # Stylesheet shipped with the paperless_mail app, resolved relative
    # to this module's location.
    css_file = (
        Path(__file__).parent.parent.parent
        / "paperless_mail"
        / "templates"
        / "output.css"
    )
    email_html_file = self.mail_to_html(mail)

    with (
        GotenbergClient(
            host=settings.TIKA_GOTENBERG_ENDPOINT,
            timeout=settings.CELERY_TASK_TIME_LIMIT,
        ) as client,
        client.chromium.html_to_pdf() as route,
    ):
        # Configure requested PDF/A formatting, if any
        pdf_a_format = self._settings_to_gotenberg_pdfa()
        if pdf_a_format is not None:
            route.pdf_format(pdf_a_format)

        try:
            # A4 page with uniform 0.1 inch margins at 1:1 scale.
            response = (
                route.index(email_html_file)
                .resource(css_file)
                .margins(
                    PageMarginsType(
                        top=Measurement(0.1, MeasurementUnitType.Inches),
                        bottom=Measurement(0.1, MeasurementUnitType.Inches),
                        left=Measurement(0.1, MeasurementUnitType.Inches),
                        right=Measurement(0.1, MeasurementUnitType.Inches),
                    ),
                )
                .size(A4)
                .scale(1.0)
                .run()
            )
        except Exception as err:
            raise ParseError(
                f"Error while converting email to PDF: {err}",
            ) from err

    email_as_pdf_file = Path(self._tempdir) / "email_as_pdf.pdf"
    email_as_pdf_file.write_bytes(response.content)

    return email_as_pdf_file
|
||||||
|
|
||||||
|
def generate_pdf_from_html(
    self,
    orig_html: str,
    attachments: list[MailAttachment],
) -> Path:
    """Generate a PDF from the HTML content of the email.

    Script tags are neutralized, inline (cid:) attachments are written to
    disk and registered as Gotenberg resources, and the rewritten HTML is
    converted to an A4 PDF.

    Parameters
    ----------
    orig_html:
        Raw HTML string from the email body.
    attachments:
        List of email attachments (used as inline resources).

    Returns
    -------
    Path
        Path to the generated PDF inside the temporary directory.

    Raises
    ------
    documents.parsers.ParseError
        If Gotenberg returns an error.
    """

    def clean_html_script(text: str) -> str:
        """Disable <script> blocks by rewriting them into hidden divs."""
        compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
        text = compiled_open.sub("<div hidden ", text)

        compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
        text = compiled_close.sub("</div", text)
        return text

    logger.info("Converting message html to PDF")

    tempdir = Path(self._tempdir)

    html_clean = clean_html_script(orig_html)

    with (
        GotenbergClient(
            host=settings.TIKA_GOTENBERG_ENDPOINT,
            timeout=settings.CELERY_TASK_TIME_LIMIT,
        ) as client,
        client.chromium.html_to_pdf() as route,
    ):
        # Configure requested PDF/A formatting, if any
        pdf_a_format = self._settings_to_gotenberg_pdfa()
        if pdf_a_format is not None:
            route.pdf_format(pdf_a_format)

        # Add attachments as resources, cleaning the filename and replacing
        # it in the index file for inclusion
        for attachment in attachments:
            # Clean the attachment name to be valid
            name_cid = f"cid:{attachment.content_id}"
            name_clean = "".join(e for e in name_cid if e.isalnum())

            # Write attachment payload to a temp file
            temp_file = tempdir / name_clean
            temp_file.write_bytes(attachment.payload)

            route.resource(temp_file)

            # Replace as needed the name with the clean name
            html_clean = html_clean.replace(name_cid, name_clean)

        # Write the index file only once, after all cid references have
        # been rewritten. (Previously the pre-substitution HTML was also
        # written to the same path before the loop — a dead write that
        # was unconditionally overwritten here.)
        html_clean_file = tempdir / "index.html"
        html_clean_file.write_text(html_clean)
        # This is our index file, the main page basically
        route.index(html_clean_file)

        # Set page size, margins
        route.margins(
            PageMarginsType(
                top=Measurement(0.1, MeasurementUnitType.Inches),
                bottom=Measurement(0.1, MeasurementUnitType.Inches),
                left=Measurement(0.1, MeasurementUnitType.Inches),
                right=Measurement(0.1, MeasurementUnitType.Inches),
            ),
        ).size(A4).scale(1.0)

        try:
            response = route.run()

        except Exception as err:
            raise ParseError(
                f"Error while converting document to PDF: {err}",
            ) from err

    html_pdf = tempdir / "html.pdf"
    html_pdf.write_bytes(response.content)
    return html_pdf
|
||||||
@@ -33,6 +33,7 @@ name, version, author, url, supported_mime_types (callable), score (callable).
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import threading
|
||||||
from importlib.metadata import entry_points
|
from importlib.metadata import entry_points
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
@@ -49,6 +50,7 @@ logger = logging.getLogger("paperless.parsers.registry")
|
|||||||
|
|
||||||
_registry: ParserRegistry | None = None
|
_registry: ParserRegistry | None = None
|
||||||
_discovery_complete: bool = False
|
_discovery_complete: bool = False
|
||||||
|
_lock = threading.Lock()
|
||||||
|
|
||||||
# Attribute names that every registered external parser class must expose.
|
# Attribute names that every registered external parser class must expose.
|
||||||
_REQUIRED_ATTRS: tuple[str, ...] = (
|
_REQUIRED_ATTRS: tuple[str, ...] = (
|
||||||
@@ -74,7 +76,6 @@ def get_parser_registry() -> ParserRegistry:
|
|||||||
1. Creates a new ParserRegistry.
|
1. Creates a new ParserRegistry.
|
||||||
2. Calls register_defaults to install built-in parsers.
|
2. Calls register_defaults to install built-in parsers.
|
||||||
3. Calls discover to load third-party plugins via importlib.metadata entrypoints.
|
3. Calls discover to load third-party plugins via importlib.metadata entrypoints.
|
||||||
4. Calls log_summary to emit a startup summary.
|
|
||||||
|
|
||||||
Subsequent calls return the same instance immediately.
|
Subsequent calls return the same instance immediately.
|
||||||
|
|
||||||
@@ -85,13 +86,14 @@ def get_parser_registry() -> ParserRegistry:
|
|||||||
"""
|
"""
|
||||||
global _registry, _discovery_complete
|
global _registry, _discovery_complete
|
||||||
|
|
||||||
|
with _lock:
|
||||||
if _registry is None:
|
if _registry is None:
|
||||||
_registry = ParserRegistry()
|
r = ParserRegistry()
|
||||||
_registry.register_defaults()
|
r.register_defaults()
|
||||||
|
_registry = r
|
||||||
|
|
||||||
if not _discovery_complete:
|
if not _discovery_complete:
|
||||||
_registry.discover()
|
_registry.discover()
|
||||||
_registry.log_summary()
|
|
||||||
_discovery_complete = True
|
_discovery_complete = True
|
||||||
|
|
||||||
return _registry
|
return _registry
|
||||||
@@ -113,9 +115,11 @@ def init_builtin_parsers() -> None:
|
|||||||
"""
|
"""
|
||||||
global _registry
|
global _registry
|
||||||
|
|
||||||
|
with _lock:
|
||||||
if _registry is None:
|
if _registry is None:
|
||||||
_registry = ParserRegistry()
|
r = ParserRegistry()
|
||||||
_registry.register_defaults()
|
r.register_defaults()
|
||||||
|
_registry = r
|
||||||
|
|
||||||
|
|
||||||
def reset_parser_registry() -> None:
|
def reset_parser_registry() -> None:
|
||||||
@@ -193,9 +197,17 @@ class ParserRegistry:
|
|||||||
that log output is predictable; scoring determines which parser wins
|
that log output is predictable; scoring determines which parser wins
|
||||||
at runtime regardless of registration order.
|
at runtime regardless of registration order.
|
||||||
"""
|
"""
|
||||||
|
from paperless.parsers.mail import MailDocumentParser
|
||||||
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
|
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
self.register_builtin(TextDocumentParser)
|
self.register_builtin(TextDocumentParser)
|
||||||
|
self.register_builtin(RemoteDocumentParser)
|
||||||
|
self.register_builtin(TikaDocumentParser)
|
||||||
|
self.register_builtin(MailDocumentParser)
|
||||||
|
self.register_builtin(RasterisedDocumentParser)
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Discovery
|
# Discovery
|
||||||
@@ -296,6 +308,23 @@ class ParserRegistry:
|
|||||||
getattr(cls, "url", "unknown"),
|
getattr(cls, "url", "unknown"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Inspection helpers
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def all_parsers(self) -> list[type[ParserProtocol]]:
|
||||||
|
"""Return all registered parser classes (external first, then builtins).
|
||||||
|
|
||||||
|
Used by compatibility wrappers that need to iterate every parser to
|
||||||
|
compute the full set of supported MIME types and file extensions.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
list[type[ParserProtocol]]
|
||||||
|
External parsers followed by built-in parsers.
|
||||||
|
"""
|
||||||
|
return [*self._external, *self._builtins]
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Parser resolution
|
# Parser resolution
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
@@ -326,7 +355,7 @@ class ParserRegistry:
|
|||||||
mime_type:
|
mime_type:
|
||||||
The detected MIME type of the file.
|
The detected MIME type of the file.
|
||||||
filename:
|
filename:
|
||||||
The original filename, including extension.
|
The original filename, including extension. May be empty in some cases
|
||||||
path:
|
path:
|
||||||
Optional filesystem path to the file. Forwarded to each
|
Optional filesystem path to the file. Forwarded to each
|
||||||
parser's score method.
|
parser's score method.
|
||||||
|
|||||||
433
src/paperless/parsers/remote.py
Normal file
@@ -0,0 +1,433 @@
|
|||||||
|
"""
|
||||||
|
Built-in remote-OCR document parser.
|
||||||
|
|
||||||
|
Handles documents by sending them to a configured remote OCR engine
|
||||||
|
(currently Azure AI Vision / Document Intelligence) and retrieving both
|
||||||
|
the extracted text and a searchable PDF with an embedded text layer.
|
||||||
|
|
||||||
|
When no engine is configured, ``score()`` returns ``None`` so the parser
|
||||||
|
is effectively invisible to the registry — the tesseract parser handles
|
||||||
|
these MIME types instead.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
from paperless.version import __full_version_str__
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import datetime
|
||||||
|
from types import TracebackType
|
||||||
|
|
||||||
|
from paperless.parsers import MetadataEntry
|
||||||
|
from paperless.parsers import ParserContext
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.parsing.remote")
|
||||||
|
|
||||||
|
# MIME types this parser accepts, mapped to the preferred file extension
# for each type. score() additionally gates on engine configuration.
_SUPPORTED_MIME_TYPES: dict[str, str] = {
    "application/pdf": ".pdf",
    "image/png": ".png",
    "image/jpeg": ".jpg",
    "image/tiff": ".tiff",
    "image/bmp": ".bmp",
    "image/gif": ".gif",
    "image/webp": ".webp",
}
|
||||||
|
|
||||||
|
|
||||||
|
class RemoteEngineConfig:
    """Holds and validates the remote OCR engine configuration."""

    def __init__(
        self,
        engine: str | None,
        api_key: str | None = None,
        endpoint: str | None = None,
    ) -> None:
        self.engine = engine
        self.api_key = api_key
        self.endpoint = endpoint

    def engine_is_valid(self) -> bool:
        """Return True when the engine is known and fully configured."""
        if self.engine not in ("azureai",):
            return False
        if self.api_key is None:
            return False
        # Azure AI additionally requires an endpoint URL.
        return not (self.engine == "azureai" and self.endpoint is None)
|
||||||
|
|
||||||
|
|
||||||
|
class RemoteDocumentParser:
|
||||||
|
"""Parse documents via a remote OCR API (currently Azure AI Vision).
|
||||||
|
|
||||||
|
This parser sends documents to a remote engine that returns both
|
||||||
|
extracted text and a searchable PDF with an embedded text layer.
|
||||||
|
It does not depend on Tesseract or ocrmypdf.
|
||||||
|
|
||||||
|
Class attributes
|
||||||
|
----------------
|
||||||
|
name : str
|
||||||
|
Human-readable parser name.
|
||||||
|
version : str
|
||||||
|
Semantic version string, kept in sync with Paperless-ngx releases.
|
||||||
|
author : str
|
||||||
|
Maintainer name.
|
||||||
|
url : str
|
||||||
|
Issue tracker / source URL.
|
||||||
|
"""
|
||||||
|
|
||||||
|
name: str = "Paperless-ngx Remote OCR Parser"
|
||||||
|
version: str = __full_version_str__
|
||||||
|
author: str = "Paperless-ngx Contributors"
|
||||||
|
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Class methods
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
@classmethod
def supported_mime_types(cls) -> dict[str, str]:
    """Return the MIME types this parser can handle.

    The mapping is returned unconditionally — even with no remote engine
    configured — because ``score()`` is the method responsible for
    deciding whether this parser is active (it returns ``None`` when
    unconfigured).

    Returns
    -------
    dict[str, str]
        Mapping of MIME type to preferred file extension.
    """
    return _SUPPORTED_MIME_TYPES
|
||||||
|
|
||||||
|
@classmethod
def score(
    cls,
    mime_type: str,
    filename: str,
    path: Path | None = None,
) -> int | None:
    """Return the priority score for handling this file, or None.

    When no valid remote engine is configured the parser returns
    ``None`` and is invisible to the registry for this file. When
    configured, it returns 20 — higher than the Tesseract parser's
    default of 10 — so the remote engine takes priority.

    Parameters
    ----------
    mime_type:
        Detected MIME type of the file.
    filename:
        Original filename including extension.
    path:
        Optional filesystem path. Not inspected by this parser.

    Returns
    -------
    int | None
        20 when the remote engine is configured and the MIME type is
        supported, otherwise None.
    """
    engine_config = RemoteEngineConfig(
        engine=settings.REMOTE_OCR_ENGINE,
        api_key=settings.REMOTE_OCR_API_KEY,
        endpoint=settings.REMOTE_OCR_ENDPOINT,
    )
    if engine_config.engine_is_valid() and mime_type in _SUPPORTED_MIME_TYPES:
        return 20
    return None
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Properties
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
@property
def can_produce_archive(self) -> bool:
    """Whether this parser can produce a searchable PDF archive copy.

    Returns
    -------
    bool
        Always True: the remote engine returns a PDF with an embedded
        text layer, which serves as the archive copy.
    """
    return True
|
||||||
|
|
||||||
|
@property
def requires_pdf_rendition(self) -> bool:
    """Whether the parser must produce a PDF for the frontend to display.

    Returns
    -------
    bool
        Always False: supported originals are either directly
        displayable by the browser (PDF) or covered by the archive
        copy (images).
    """
    return False
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Lifecycle
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def __init__(self, logging_group: object = None) -> None:
    """Create per-parse scratch space under SCRATCH_DIR and init state."""
    settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
    scratch = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
    self._tempdir = Path(scratch)
    self._logging_group = logging_group
    # Results of parse(); remain None until parse() has run.
    self._text: str | None = None
    self._archive_path: Path | None = None
|
||||||
|
|
||||||
|
def __enter__(self) -> Self:
    """Enter the context manager; returns the parser itself."""
    return self
|
||||||
|
|
||||||
|
def __exit__(
    self,
    exc_type: type[BaseException] | None,
    exc_val: BaseException | None,
    exc_tb: TracebackType | None,
) -> None:
    """Remove the scratch directory on context exit.

    Removal errors are ignored; any in-flight exception is never
    suppressed (this method returns None).
    """
    logger.debug("Cleaning up temporary directory %s", self._tempdir)
    shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Core parsing interface
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def configure(self, context: ParserContext) -> None:
    """No-op: this parser requires no per-document configuration."""
    pass
|
||||||
|
|
||||||
|
def parse(
    self,
    document_path: Path,
    mime_type: str,
    *,
    produce_archive: bool = True,
) -> None:
    """Send the document to the remote engine and store results.

    Parameters
    ----------
    document_path:
        Absolute path to the document file to parse.
    mime_type:
        Detected MIME type of the document.
    produce_archive:
        Ignored — the remote engine always returns a searchable PDF,
        which is stored as the archive copy regardless of this flag.
    """
    engine_config = RemoteEngineConfig(
        engine=settings.REMOTE_OCR_ENGINE,
        api_key=settings.REMOTE_OCR_API_KEY,
        endpoint=settings.REMOTE_OCR_ENDPOINT,
    )

    # Without a usable engine, degrade gracefully to empty content.
    if not engine_config.engine_is_valid():
        logger.warning(
            "No valid remote parser engine is configured, content will be empty.",
        )
        self._text = ""
        return

    if engine_config.engine == "azureai":
        self._text = self._azure_ai_vision_parse(document_path, engine_config)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Result accessors
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_text(self) -> str | None:
|
||||||
|
"""Return the plain-text content extracted during parse."""
|
||||||
|
return self._text
|
||||||
|
|
||||||
|
def get_date(self) -> datetime.datetime | None:
|
||||||
|
"""Return the document date detected during parse.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
datetime.datetime | None
|
||||||
|
Always None — the remote parser does not detect dates.
|
||||||
|
"""
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_archive_path(self) -> Path | None:
|
||||||
|
"""Return the path to the generated archive PDF, or None."""
|
||||||
|
return self._archive_path
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Thumbnail and metadata
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||||
|
"""Generate a thumbnail image for the document.
|
||||||
|
|
||||||
|
Uses the archive PDF produced by the remote engine when available,
|
||||||
|
otherwise falls back to the original document path (PDF inputs).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the source document.
|
||||||
|
mime_type:
|
||||||
|
Detected MIME type of the document.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Path to the generated WebP thumbnail inside the temp directory.
|
||||||
|
"""
|
||||||
|
# make_thumbnail_from_pdf lives in documents.parsers for now;
|
||||||
|
# it will move to paperless.parsers.utils when the tesseract
|
||||||
|
# parser is migrated in a later phase.
|
||||||
|
from documents.parsers import make_thumbnail_from_pdf
|
||||||
|
|
||||||
|
return make_thumbnail_from_pdf(
|
||||||
|
self._archive_path or document_path,
|
||||||
|
self._tempdir,
|
||||||
|
self._logging_group,
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_page_count(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
) -> int | None:
|
||||||
|
"""Return the number of pages in a PDF document.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the source document.
|
||||||
|
mime_type:
|
||||||
|
Detected MIME type of the document.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
int | None
|
||||||
|
Page count for PDF inputs, or ``None`` for other MIME types.
|
||||||
|
"""
|
||||||
|
if mime_type != "application/pdf":
|
||||||
|
return None
|
||||||
|
|
||||||
|
from paperless.parsers.utils import get_page_count_for_pdf
|
||||||
|
|
||||||
|
return get_page_count_for_pdf(document_path, log=logger)
|
||||||
|
|
||||||
|
def extract_metadata(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
) -> list[MetadataEntry]:
|
||||||
|
"""Extract format-specific metadata from the document.
|
||||||
|
|
||||||
|
Delegates to the shared pikepdf-based extractor for PDF files.
|
||||||
|
Returns ``[]`` for all other MIME types.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the file to extract metadata from.
|
||||||
|
mime_type:
|
||||||
|
MIME type of the file. May be ``"application/pdf"`` when
|
||||||
|
called for the archive version of an image original.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
list[MetadataEntry]
|
||||||
|
Zero or more metadata entries.
|
||||||
|
"""
|
||||||
|
if mime_type != "application/pdf":
|
||||||
|
return []
|
||||||
|
|
||||||
|
from paperless.parsers.utils import extract_pdf_metadata
|
||||||
|
|
||||||
|
return extract_pdf_metadata(document_path, log=logger)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Private helpers
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _azure_ai_vision_parse(
|
||||||
|
self,
|
||||||
|
file: Path,
|
||||||
|
config: RemoteEngineConfig,
|
||||||
|
) -> str | None:
|
||||||
|
"""Send ``file`` to Azure AI Document Intelligence and return text.
|
||||||
|
|
||||||
|
Downloads the searchable PDF output from Azure and stores it at
|
||||||
|
``self._archive_path``. Returns the extracted text content, or
|
||||||
|
``None`` on failure (the error is logged).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
file:
|
||||||
|
Absolute path to the document to analyse.
|
||||||
|
config:
|
||||||
|
Validated remote engine configuration.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
str | None
|
||||||
|
Extracted text, or None if the Azure call failed.
|
||||||
|
"""
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
# Callers must have already validated config via engine_is_valid():
|
||||||
|
# engine_is_valid() asserts api_key is not None and (for azureai)
|
||||||
|
# endpoint is not None, so these casts are provably safe.
|
||||||
|
assert config.endpoint is not None
|
||||||
|
assert config.api_key is not None
|
||||||
|
|
||||||
|
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||||
|
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
||||||
|
from azure.ai.documentintelligence.models import AnalyzeOutputOption
|
||||||
|
from azure.ai.documentintelligence.models import DocumentContentFormat
|
||||||
|
from azure.core.credentials import AzureKeyCredential
|
||||||
|
|
||||||
|
client = DocumentIntelligenceClient(
|
||||||
|
endpoint=config.endpoint,
|
||||||
|
credential=AzureKeyCredential(config.api_key),
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with file.open("rb") as f:
|
||||||
|
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
|
||||||
|
poller = client.begin_analyze_document(
|
||||||
|
model_id="prebuilt-read",
|
||||||
|
body=analyze_request,
|
||||||
|
output_content_format=DocumentContentFormat.TEXT,
|
||||||
|
output=[AnalyzeOutputOption.PDF],
|
||||||
|
content_type="application/json",
|
||||||
|
)
|
||||||
|
|
||||||
|
poller.wait()
|
||||||
|
result_id = poller.details["operation_id"]
|
||||||
|
result = poller.result()
|
||||||
|
|
||||||
|
self._archive_path = self._tempdir / "archive.pdf"
|
||||||
|
with self._archive_path.open("wb") as f:
|
||||||
|
for chunk in client.get_analyze_result_pdf(
|
||||||
|
model_id="prebuilt-read",
|
||||||
|
result_id=result_id,
|
||||||
|
):
|
||||||
|
f.write(chunk)
|
||||||
|
|
||||||
|
return result.content
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Azure AI Vision parsing failed: %s", e)
|
||||||
|
|
||||||
|
finally:
|
||||||
|
client.close()
|
||||||
|
|
||||||
|
return None
|
||||||
@@ -1,13 +1,18 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
from typing import Any
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from documents.parsers import DocumentParser
|
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from documents.parsers import make_thumbnail_from_pdf
|
from documents.parsers import make_thumbnail_from_pdf
|
||||||
from documents.utils import maybe_override_pixel_limit
|
from documents.utils import maybe_override_pixel_limit
|
||||||
@@ -16,6 +21,28 @@ from paperless.config import OcrConfig
|
|||||||
from paperless.models import ArchiveFileChoices
|
from paperless.models import ArchiveFileChoices
|
||||||
from paperless.models import CleanChoices
|
from paperless.models import CleanChoices
|
||||||
from paperless.models import ModeChoices
|
from paperless.models import ModeChoices
|
||||||
|
from paperless.parsers.utils import read_file_handle_unicode_errors
|
||||||
|
from paperless.version import __full_version_str__
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import datetime
|
||||||
|
from types import TracebackType
|
||||||
|
|
||||||
|
from paperless.parsers import MetadataEntry
|
||||||
|
from paperless.parsers import ParserContext
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.parsing.tesseract")
|
||||||
|
|
||||||
|
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||||
|
"application/pdf": ".pdf",
|
||||||
|
"image/jpeg": ".jpg",
|
||||||
|
"image/png": ".png",
|
||||||
|
"image/tiff": ".tif",
|
||||||
|
"image/gif": ".gif",
|
||||||
|
"image/bmp": ".bmp",
|
||||||
|
"image/webp": ".webp",
|
||||||
|
"image/heic": ".heic",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class NoTextFoundException(Exception):
|
class NoTextFoundException(Exception):
|
||||||
@@ -26,81 +53,125 @@ class RtlLanguageException(Exception):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class RasterisedDocumentParser(DocumentParser):
|
class RasterisedDocumentParser:
|
||||||
"""
|
"""
|
||||||
This parser uses Tesseract to try and get some text out of a rasterised
|
This parser uses Tesseract to try and get some text out of a rasterised
|
||||||
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
logging_name = "paperless.parsing.tesseract"
|
name: str = "Paperless-ngx Tesseract OCR Parser"
|
||||||
|
version: str = __full_version_str__
|
||||||
|
author: str = "Paperless-ngx Contributors"
|
||||||
|
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||||
|
|
||||||
def get_settings(self) -> OcrConfig:
|
# ------------------------------------------------------------------
|
||||||
"""
|
# Class methods
|
||||||
This parser uses the OCR configuration settings to parse documents
|
# ------------------------------------------------------------------
|
||||||
"""
|
|
||||||
return OcrConfig()
|
|
||||||
|
|
||||||
def get_page_count(self, document_path, mime_type):
|
@classmethod
|
||||||
page_count = None
|
def supported_mime_types(cls) -> dict[str, str]:
|
||||||
if mime_type == "application/pdf":
|
return _SUPPORTED_MIME_TYPES
|
||||||
try:
|
|
||||||
import pikepdf
|
|
||||||
|
|
||||||
with pikepdf.Pdf.open(document_path) as pdf:
|
@classmethod
|
||||||
page_count = len(pdf.pages)
|
def score(
|
||||||
except Exception as e:
|
cls,
|
||||||
self.log.warning(
|
mime_type: str,
|
||||||
f"Unable to determine PDF page count {document_path}: {e}",
|
filename: str,
|
||||||
|
path: Path | None = None,
|
||||||
|
) -> int | None:
|
||||||
|
if mime_type in _SUPPORTED_MIME_TYPES:
|
||||||
|
return 10
|
||||||
|
return None
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Properties
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
@property
|
||||||
|
def can_produce_archive(self) -> bool:
|
||||||
|
return True
|
||||||
|
|
||||||
|
@property
|
||||||
|
def requires_pdf_rendition(self) -> bool:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Lifecycle
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def __init__(self, logging_group: object = None) -> None:
|
||||||
|
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
self.tempdir = Path(
|
||||||
|
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||||
)
|
)
|
||||||
return page_count
|
self.settings = OcrConfig()
|
||||||
|
self.archive_path: Path | None = None
|
||||||
|
self.text: str | None = None
|
||||||
|
self.date: datetime.datetime | None = None
|
||||||
|
self.log = logger
|
||||||
|
|
||||||
def extract_metadata(self, document_path, mime_type):
|
def __enter__(self) -> Self:
|
||||||
result = []
|
return self
|
||||||
if mime_type == "application/pdf":
|
|
||||||
import pikepdf
|
|
||||||
|
|
||||||
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
|
def __exit__(
|
||||||
|
self,
|
||||||
|
exc_type: type[BaseException] | None,
|
||||||
|
exc_val: BaseException | None,
|
||||||
|
exc_tb: TracebackType | None,
|
||||||
|
) -> None:
|
||||||
|
logger.debug("Cleaning up temporary directory %s", self.tempdir)
|
||||||
|
shutil.rmtree(self.tempdir, ignore_errors=True)
|
||||||
|
|
||||||
pdf = pikepdf.open(document_path)
|
# ------------------------------------------------------------------
|
||||||
meta = pdf.open_metadata()
|
# Core parsing interface
|
||||||
for key, value in meta.items():
|
# ------------------------------------------------------------------
|
||||||
if isinstance(value, list):
|
|
||||||
value = " ".join([str(e) for e in value])
|
|
||||||
value = str(value)
|
|
||||||
try:
|
|
||||||
m = namespace_pattern.match(key)
|
|
||||||
if m is None: # pragma: no cover
|
|
||||||
continue
|
|
||||||
namespace = m.group(1)
|
|
||||||
key_value = m.group(2)
|
|
||||||
try:
|
|
||||||
namespace.encode("utf-8")
|
|
||||||
key_value.encode("utf-8")
|
|
||||||
except UnicodeEncodeError as e: # pragma: no cover
|
|
||||||
self.log.debug(f"Skipping metadata key {key}: {e}")
|
|
||||||
continue
|
|
||||||
result.append(
|
|
||||||
{
|
|
||||||
"namespace": namespace,
|
|
||||||
"prefix": meta.REVERSE_NS[namespace],
|
|
||||||
"key": key_value,
|
|
||||||
"value": value,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
self.log.warning(
|
|
||||||
f"Error while reading metadata {key}: {value}. Error: {e}",
|
|
||||||
)
|
|
||||||
return result
|
|
||||||
|
|
||||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
def configure(self, context: ParserContext) -> None:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Result accessors
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_text(self) -> str | None:
|
||||||
|
return self.text
|
||||||
|
|
||||||
|
def get_date(self) -> datetime.datetime | None:
|
||||||
|
return self.date
|
||||||
|
|
||||||
|
def get_archive_path(self) -> Path | None:
|
||||||
|
return self.archive_path
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Thumbnail, page count, and metadata
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||||
return make_thumbnail_from_pdf(
|
return make_thumbnail_from_pdf(
|
||||||
self.archive_path or document_path,
|
self.archive_path or Path(document_path),
|
||||||
self.tempdir,
|
self.tempdir,
|
||||||
self.logging_group,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def is_image(self, mime_type) -> bool:
|
def get_page_count(self, document_path: Path, mime_type: str) -> int | None:
|
||||||
|
if mime_type == "application/pdf":
|
||||||
|
from paperless.parsers.utils import get_page_count_for_pdf
|
||||||
|
|
||||||
|
return get_page_count_for_pdf(Path(document_path), log=self.log)
|
||||||
|
return None
|
||||||
|
|
||||||
|
def extract_metadata(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
) -> list[MetadataEntry]:
|
||||||
|
if mime_type != "application/pdf":
|
||||||
|
return []
|
||||||
|
|
||||||
|
from paperless.parsers.utils import extract_pdf_metadata
|
||||||
|
|
||||||
|
return extract_pdf_metadata(Path(document_path), log=self.log)
|
||||||
|
|
||||||
|
def is_image(self, mime_type: str) -> bool:
|
||||||
return mime_type in [
|
return mime_type in [
|
||||||
"image/png",
|
"image/png",
|
||||||
"image/jpeg",
|
"image/jpeg",
|
||||||
@@ -111,25 +182,25 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
"image/heic",
|
"image/heic",
|
||||||
]
|
]
|
||||||
|
|
||||||
def has_alpha(self, image) -> bool:
|
def has_alpha(self, image: Path) -> bool:
|
||||||
with Image.open(image) as im:
|
with Image.open(image) as im:
|
||||||
return im.mode in ("RGBA", "LA")
|
return im.mode in ("RGBA", "LA")
|
||||||
|
|
||||||
def remove_alpha(self, image_path: str) -> Path:
|
def remove_alpha(self, image_path: Path) -> Path:
|
||||||
no_alpha_image = Path(self.tempdir) / "image-no-alpha"
|
no_alpha_image = Path(self.tempdir) / "image-no-alpha"
|
||||||
run_subprocess(
|
run_subprocess(
|
||||||
[
|
[
|
||||||
settings.CONVERT_BINARY,
|
settings.CONVERT_BINARY,
|
||||||
"-alpha",
|
"-alpha",
|
||||||
"off",
|
"off",
|
||||||
image_path,
|
str(image_path),
|
||||||
no_alpha_image,
|
str(no_alpha_image),
|
||||||
],
|
],
|
||||||
logger=self.log,
|
logger=self.log,
|
||||||
)
|
)
|
||||||
return no_alpha_image
|
return no_alpha_image
|
||||||
|
|
||||||
def get_dpi(self, image) -> int | None:
|
def get_dpi(self, image: Path) -> int | None:
|
||||||
try:
|
try:
|
||||||
with Image.open(image) as im:
|
with Image.open(image) as im:
|
||||||
x, _ = im.info["dpi"]
|
x, _ = im.info["dpi"]
|
||||||
@@ -138,7 +209,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
self.log.warning(f"Error while getting DPI from image {image}: {e}")
|
self.log.warning(f"Error while getting DPI from image {image}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def calculate_a4_dpi(self, image) -> int | None:
|
def calculate_a4_dpi(self, image: Path) -> int | None:
|
||||||
try:
|
try:
|
||||||
with Image.open(image) as im:
|
with Image.open(image) as im:
|
||||||
width, _ = im.size
|
width, _ = im.size
|
||||||
@@ -156,6 +227,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
sidecar_file: Path | None,
|
sidecar_file: Path | None,
|
||||||
pdf_file: Path,
|
pdf_file: Path,
|
||||||
) -> str | None:
|
) -> str | None:
|
||||||
|
text: str | None = None
|
||||||
# When re-doing OCR, the sidecar contains ONLY the new text, not
|
# When re-doing OCR, the sidecar contains ONLY the new text, not
|
||||||
# the whole text, so do not utilize it in that case
|
# the whole text, so do not utilize it in that case
|
||||||
if (
|
if (
|
||||||
@@ -163,7 +235,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
and sidecar_file.is_file()
|
and sidecar_file.is_file()
|
||||||
and self.settings.mode != "redo"
|
and self.settings.mode != "redo"
|
||||||
):
|
):
|
||||||
text = self.read_file_handle_unicode_errors(sidecar_file)
|
text = read_file_handle_unicode_errors(sidecar_file)
|
||||||
|
|
||||||
if "[OCR skipped on page" not in text:
|
if "[OCR skipped on page" not in text:
|
||||||
# This happens when there's already text in the input file.
|
# This happens when there's already text in the input file.
|
||||||
@@ -191,12 +263,12 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
"-layout",
|
"-layout",
|
||||||
"-enc",
|
"-enc",
|
||||||
"UTF-8",
|
"UTF-8",
|
||||||
pdf_file,
|
str(pdf_file),
|
||||||
tmp.name,
|
tmp.name,
|
||||||
],
|
],
|
||||||
logger=self.log,
|
logger=self.log,
|
||||||
)
|
)
|
||||||
text = self.read_file_handle_unicode_errors(Path(tmp.name))
|
text = read_file_handle_unicode_errors(Path(tmp.name))
|
||||||
|
|
||||||
return post_process_text(text)
|
return post_process_text(text)
|
||||||
|
|
||||||
@@ -211,17 +283,15 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
def construct_ocrmypdf_parameters(
|
def construct_ocrmypdf_parameters(
|
||||||
self,
|
self,
|
||||||
input_file,
|
input_file: Path,
|
||||||
mime_type,
|
mime_type: str,
|
||||||
output_file,
|
output_file: Path,
|
||||||
sidecar_file,
|
sidecar_file: Path,
|
||||||
*,
|
*,
|
||||||
safe_fallback=False,
|
safe_fallback: bool = False,
|
||||||
):
|
) -> dict[str, Any]:
|
||||||
if TYPE_CHECKING:
|
ocrmypdf_args: dict[str, Any] = {
|
||||||
assert isinstance(self.settings, OcrConfig)
|
"input_file_or_options": input_file,
|
||||||
ocrmypdf_args = {
|
|
||||||
"input_file": input_file,
|
|
||||||
"output_file": output_file,
|
"output_file": output_file,
|
||||||
# need to use threads, since this will be run in daemonized
|
# need to use threads, since this will be run in daemonized
|
||||||
# processes via the task library.
|
# processes via the task library.
|
||||||
@@ -285,7 +355,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
"for compatibility with img2pdf",
|
"for compatibility with img2pdf",
|
||||||
)
|
)
|
||||||
# Replace the input file with the non-alpha
|
# Replace the input file with the non-alpha
|
||||||
ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
|
ocrmypdf_args["input_file_or_options"] = self.remove_alpha(input_file)
|
||||||
|
|
||||||
if dpi:
|
if dpi:
|
||||||
self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
|
self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
|
||||||
@@ -330,7 +400,13 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
return ocrmypdf_args
|
return ocrmypdf_args
|
||||||
|
|
||||||
def parse(self, document_path: Path, mime_type, file_name=None) -> None:
|
def parse(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
*,
|
||||||
|
produce_archive: bool = True,
|
||||||
|
) -> None:
|
||||||
# This forces tesseract to use one core per page.
|
# This forces tesseract to use one core per page.
|
||||||
os.environ["OMP_THREAD_LIMIT"] = "1"
|
os.environ["OMP_THREAD_LIMIT"] = "1"
|
||||||
VALID_TEXT_LENGTH = 50
|
VALID_TEXT_LENGTH = 50
|
||||||
@@ -458,7 +534,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
self.text = ""
|
self.text = ""
|
||||||
|
|
||||||
|
|
||||||
def post_process_text(text):
|
def post_process_text(text: str | None) -> str | None:
|
||||||
if not text:
|
if not text:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -27,6 +27,7 @@ if TYPE_CHECKING:
|
|||||||
from types import TracebackType
|
from types import TracebackType
|
||||||
|
|
||||||
from paperless.parsers import MetadataEntry
|
from paperless.parsers import MetadataEntry
|
||||||
|
from paperless.parsers import ParserContext
|
||||||
|
|
||||||
logger = logging.getLogger("paperless.parsing.text")
|
logger = logging.getLogger("paperless.parsing.text")
|
||||||
|
|
||||||
@@ -156,6 +157,9 @@ class TextDocumentParser:
|
|||||||
# Core parsing interface
|
# Core parsing interface
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def configure(self, context: ParserContext) -> None:
|
||||||
|
pass
|
||||||
|
|
||||||
def parse(
|
def parse(
|
||||||
self,
|
self,
|
||||||
document_path: Path,
|
document_path: Path,
|
||||||
|
|||||||
452
src/paperless/parsers/tika.py
Normal file
@@ -0,0 +1,452 @@
|
|||||||
|
"""
|
||||||
|
Built-in Tika document parser.
|
||||||
|
|
||||||
|
Handles Office documents (DOCX, ODT, XLS, XLSX, PPT, PPTX, RTF, etc.) by
|
||||||
|
sending them to an Apache Tika server for text extraction and a Gotenberg
|
||||||
|
server for PDF conversion. Because the source formats cannot be rendered by
|
||||||
|
a browser natively, the parser always produces a PDF rendition for display.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
from contextlib import ExitStack
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from django.conf import settings
|
||||||
|
from django.utils import timezone
|
||||||
|
from gotenberg_client import GotenbergClient
|
||||||
|
from gotenberg_client.options import PdfAFormat
|
||||||
|
from tika_client import TikaClient
|
||||||
|
|
||||||
|
from documents.parsers import ParseError
|
||||||
|
from documents.parsers import make_thumbnail_from_pdf
|
||||||
|
from paperless.config import OutputTypeConfig
|
||||||
|
from paperless.models import OutputTypeChoices
|
||||||
|
from paperless.version import __full_version_str__
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import datetime
|
||||||
|
from types import TracebackType
|
||||||
|
|
||||||
|
from paperless.parsers import MetadataEntry
|
||||||
|
from paperless.parsers import ParserContext
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.parsing.tika")
|
||||||
|
|
||||||
|
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||||
|
"application/msword": ".doc",
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
||||||
|
"application/vnd.ms-excel": ".xls",
|
||||||
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
||||||
|
"application/vnd.ms-powerpoint": ".ppt",
|
||||||
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
||||||
|
"application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
|
||||||
|
"application/vnd.oasis.opendocument.presentation": ".odp",
|
||||||
|
"application/vnd.oasis.opendocument.spreadsheet": ".ods",
|
||||||
|
"application/vnd.oasis.opendocument.text": ".odt",
|
||||||
|
"application/vnd.oasis.opendocument.graphics": ".odg",
|
||||||
|
"text/rtf": ".rtf",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class TikaDocumentParser:
|
||||||
|
"""Parse Office documents via Apache Tika and Gotenberg for Paperless-ngx.
|
||||||
|
|
||||||
|
Text extraction is handled by the Tika server. PDF conversion for display
|
||||||
|
is handled by Gotenberg (LibreOffice route). Because the source formats
|
||||||
|
cannot be rendered by a browser natively, ``requires_pdf_rendition`` is
|
||||||
|
True and the PDF is always produced regardless of the ``produce_archive``
|
||||||
|
flag passed to ``parse``.
|
||||||
|
|
||||||
|
Both ``TikaClient`` and ``GotenbergClient`` are opened once in
|
||||||
|
``__enter__`` via an ``ExitStack`` and shared across ``parse``,
|
||||||
|
``extract_metadata``, and ``_convert_to_pdf`` calls, then closed via
|
||||||
|
``ExitStack.close()`` in ``__exit__``. The parser must always be used
|
||||||
|
as a context manager.
|
||||||
|
|
||||||
|
Class attributes
|
||||||
|
----------------
|
||||||
|
name : str
|
||||||
|
Human-readable parser name.
|
||||||
|
version : str
|
||||||
|
Semantic version string, kept in sync with Paperless-ngx releases.
|
||||||
|
author : str
|
||||||
|
Maintainer name.
|
||||||
|
url : str
|
||||||
|
Issue tracker / source URL.
|
||||||
|
"""
|
||||||
|
|
||||||
|
name: str = "Paperless-ngx Tika Parser"
|
||||||
|
version: str = __full_version_str__
|
||||||
|
author: str = "Paperless-ngx Contributors"
|
||||||
|
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Class methods
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def supported_mime_types(cls) -> dict[str, str]:
|
||||||
|
"""Return the MIME types this parser handles.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
dict[str, str]
|
||||||
|
Mapping of MIME type to preferred file extension.
|
||||||
|
"""
|
||||||
|
return _SUPPORTED_MIME_TYPES
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def score(
|
||||||
|
cls,
|
||||||
|
mime_type: str,
|
||||||
|
filename: str,
|
||||||
|
path: Path | None = None,
|
||||||
|
) -> int | None:
|
||||||
|
"""Return the priority score for handling this file.
|
||||||
|
|
||||||
|
Returns ``None`` when Tika integration is disabled so the registry
|
||||||
|
skips this parser entirely.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
mime_type:
|
||||||
|
Detected MIME type of the file.
|
||||||
|
filename:
|
||||||
|
Original filename including extension.
|
||||||
|
path:
|
||||||
|
Optional filesystem path. Not inspected by this parser.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
int | None
|
||||||
|
10 if TIKA_ENABLED and the MIME type is supported, otherwise None.
|
||||||
|
"""
|
||||||
|
if not settings.TIKA_ENABLED:
|
||||||
|
return None
|
||||||
|
if mime_type in _SUPPORTED_MIME_TYPES:
|
||||||
|
return 10
|
||||||
|
return None
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Properties
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
@property
|
||||||
|
def can_produce_archive(self) -> bool:
|
||||||
|
"""Whether this parser can produce a searchable PDF archive copy.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
bool
|
||||||
|
Always False — Tika produces a display PDF, not an OCR archive.
|
||||||
|
"""
|
||||||
|
return False
|
||||||
|
|
||||||
|
@property
|
||||||
|
def requires_pdf_rendition(self) -> bool:
|
||||||
|
"""Whether the parser must produce a PDF for the frontend to display.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
bool
|
||||||
|
Always True — Office formats cannot be rendered natively in a
|
||||||
|
browser, so a PDF conversion is always required for display.
|
||||||
|
"""
|
||||||
|
return True
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Lifecycle
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def __init__(self, logging_group: object = None) -> None:
|
||||||
|
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
self._tempdir = Path(
|
||||||
|
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||||
|
)
|
||||||
|
self._text: str | None = None
|
||||||
|
self._date: datetime.datetime | None = None
|
||||||
|
self._archive_path: Path | None = None
|
||||||
|
self._exit_stack = ExitStack()
|
||||||
|
self._tika_client: TikaClient | None = None
|
||||||
|
self._gotenberg_client: GotenbergClient | None = None
|
||||||
|
|
||||||
|
def __enter__(self) -> Self:
|
||||||
|
self._tika_client = self._exit_stack.enter_context(
|
||||||
|
TikaClient(
|
||||||
|
tika_url=settings.TIKA_ENDPOINT,
|
||||||
|
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
self._gotenberg_client = self._exit_stack.enter_context(
|
||||||
|
GotenbergClient(
|
||||||
|
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||||
|
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(
|
||||||
|
self,
|
||||||
|
exc_type: type[BaseException] | None,
|
||||||
|
exc_val: BaseException | None,
|
||||||
|
exc_tb: TracebackType | None,
|
||||||
|
) -> None:
|
||||||
|
self._exit_stack.close()
|
||||||
|
logger.debug("Cleaning up temporary directory %s", self._tempdir)
|
||||||
|
shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Core parsing interface
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def configure(self, context: ParserContext) -> None:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def parse(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
*,
|
||||||
|
produce_archive: bool = True,
|
||||||
|
) -> None:
|
||||||
|
"""Send the document to Tika for text extraction and Gotenberg for PDF.
|
||||||
|
|
||||||
|
Because ``requires_pdf_rendition`` is True the PDF conversion is
|
||||||
|
always performed — the ``produce_archive`` flag is intentionally
|
||||||
|
ignored.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the document file to parse.
|
||||||
|
mime_type:
|
||||||
|
Detected MIME type of the document.
|
||||||
|
produce_archive:
|
||||||
|
Accepted for protocol compatibility but ignored; the PDF rendition
|
||||||
|
is always produced since the source format cannot be displayed
|
||||||
|
natively in the browser.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
documents.parsers.ParseError
|
||||||
|
If Tika or Gotenberg returns an error.
|
||||||
|
"""
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
assert self._tika_client is not None
|
||||||
|
|
||||||
|
logger.info("Sending %s to Tika server", document_path)
|
||||||
|
|
||||||
|
try:
|
||||||
|
try:
|
||||||
|
parsed = self._tika_client.tika.as_text.from_file(
|
||||||
|
document_path,
|
||||||
|
mime_type,
|
||||||
|
)
|
||||||
|
except httpx.HTTPStatusError as err:
|
||||||
|
# Workaround https://issues.apache.org/jira/browse/TIKA-4110
|
||||||
|
# Tika fails with some files as multi-part form data
|
||||||
|
if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
|
||||||
|
parsed = self._tika_client.tika.as_text.from_buffer(
|
||||||
|
document_path.read_bytes(),
|
||||||
|
mime_type,
|
||||||
|
)
|
||||||
|
else: # pragma: no cover
|
||||||
|
raise
|
||||||
|
except Exception as err:
|
||||||
|
raise ParseError(
|
||||||
|
f"Could not parse {document_path} with tika server at "
|
||||||
|
f"{settings.TIKA_ENDPOINT}: {err}",
|
||||||
|
) from err
|
||||||
|
|
||||||
|
self._text = parsed.content
|
||||||
|
if self._text is not None:
|
||||||
|
self._text = self._text.strip()
|
||||||
|
|
||||||
|
self._date = parsed.created
|
||||||
|
if self._date is not None and timezone.is_naive(self._date):
|
||||||
|
self._date = timezone.make_aware(self._date)
|
||||||
|
|
||||||
|
# Always convert — requires_pdf_rendition=True means the browser
|
||||||
|
# cannot display the source format natively.
|
||||||
|
self._archive_path = self._convert_to_pdf(document_path)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Result accessors
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_text(self) -> str | None:
|
||||||
|
"""Return the plain-text content extracted during parse.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
str | None
|
||||||
|
Extracted text, or None if parse has not been called yet.
|
||||||
|
"""
|
||||||
|
return self._text
|
||||||
|
|
||||||
|
def get_date(self) -> datetime.datetime | None:
|
||||||
|
"""Return the document date detected during parse.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
datetime.datetime | None
|
||||||
|
Creation date from Tika metadata, or None if not detected.
|
||||||
|
"""
|
||||||
|
return self._date
|
||||||
|
|
||||||
|
def get_archive_path(self) -> Path | None:
|
||||||
|
"""Return the path to the generated PDF rendition, or None.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path | None
|
||||||
|
Path to the PDF produced by Gotenberg, or None if parse has not
|
||||||
|
been called yet.
|
||||||
|
"""
|
||||||
|
return self._archive_path
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Thumbnail and metadata
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||||
|
"""Generate a thumbnail from the PDF rendition of the document.
|
||||||
|
|
||||||
|
Converts the document to PDF first if not already done.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the source document.
|
||||||
|
mime_type:
|
||||||
|
Detected MIME type of the document.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Path to the generated WebP thumbnail inside the temporary directory.
|
||||||
|
"""
|
||||||
|
if self._archive_path is None:
|
||||||
|
self._archive_path = self._convert_to_pdf(document_path)
|
||||||
|
return make_thumbnail_from_pdf(self._archive_path, self._tempdir)
|
||||||
|
|
||||||
|
def get_page_count(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
) -> int | None:
|
||||||
|
"""Return the number of pages in the document.
|
||||||
|
|
||||||
|
Counts pages in the archive PDF produced by a preceding parse()
|
||||||
|
call. Returns ``None`` if parse() has not been called yet or if
|
||||||
|
no archive was produced.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
int | None
|
||||||
|
Page count of the archive PDF, or ``None``.
|
||||||
|
"""
|
||||||
|
if self._archive_path is not None:
|
||||||
|
from paperless.parsers.utils import get_page_count_for_pdf
|
||||||
|
|
||||||
|
return get_page_count_for_pdf(self._archive_path, log=logger)
|
||||||
|
return None
|
||||||
|
|
||||||
|
def extract_metadata(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
) -> list[MetadataEntry]:
|
||||||
|
"""Extract format-specific metadata via the Tika metadata endpoint.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
list[MetadataEntry]
|
||||||
|
All key/value pairs returned by Tika, or ``[]`` on error.
|
||||||
|
"""
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
assert self._tika_client is not None
|
||||||
|
|
||||||
|
try:
|
||||||
|
parsed = self._tika_client.metadata.from_file(document_path, mime_type)
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"namespace": "",
|
||||||
|
"prefix": "",
|
||||||
|
"key": key,
|
||||||
|
"value": parsed.data[key],
|
||||||
|
}
|
||||||
|
for key in parsed.data
|
||||||
|
]
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
"Error while fetching document metadata for %s: %s",
|
||||||
|
document_path,
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
return []
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Private helpers
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _convert_to_pdf(self, document_path: Path) -> Path:
|
||||||
|
"""Convert the document to PDF using Gotenberg's LibreOffice route.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the source document.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Path to the generated PDF inside the temporary directory.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
documents.parsers.ParseError
|
||||||
|
If Gotenberg returns an error.
|
||||||
|
"""
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
assert self._gotenberg_client is not None
|
||||||
|
|
||||||
|
pdf_path = self._tempdir / "convert.pdf"
|
||||||
|
|
||||||
|
logger.info("Converting %s to PDF as %s", document_path, pdf_path)
|
||||||
|
|
||||||
|
with self._gotenberg_client.libre_office.to_pdf() as route:
|
||||||
|
# Set the output format of the resulting PDF.
|
||||||
|
# OutputTypeConfig reads the database-stored ApplicationConfiguration
|
||||||
|
# first, then falls back to the PAPERLESS_OCR_OUTPUT_TYPE env var.
|
||||||
|
output_type = OutputTypeConfig().output_type
|
||||||
|
if output_type in {
|
||||||
|
OutputTypeChoices.PDF_A,
|
||||||
|
OutputTypeChoices.PDF_A2,
|
||||||
|
}:
|
||||||
|
route.pdf_format(PdfAFormat.A2b)
|
||||||
|
elif output_type == OutputTypeChoices.PDF_A1:
|
||||||
|
logger.warning(
|
||||||
|
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
|
||||||
|
)
|
||||||
|
route.pdf_format(PdfAFormat.A2b)
|
||||||
|
elif output_type == OutputTypeChoices.PDF_A3:
|
||||||
|
route.pdf_format(PdfAFormat.A3b)
|
||||||
|
|
||||||
|
route.convert(document_path)
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = route.run()
|
||||||
|
pdf_path.write_bytes(response.content)
|
||||||
|
return pdf_path
|
||||||
|
except Exception as err:
|
||||||
|
raise ParseError(
|
||||||
|
f"Error while converting document to PDF: {err}",
|
||||||
|
) from err
|
||||||
158
src/paperless/parsers/utils.py
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
"""
|
||||||
|
Shared utilities for Paperless-ngx document parsers.
|
||||||
|
|
||||||
|
Functions here are format-neutral helpers that multiple parsers need.
|
||||||
|
Keeping them here avoids parsers inheriting from each other just to
|
||||||
|
share implementation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from paperless.parsers import MetadataEntry
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.parsers.utils")
|
||||||
|
|
||||||
|
|
||||||
|
def read_file_handle_unicode_errors(
|
||||||
|
filepath: Path,
|
||||||
|
log: logging.Logger | None = None,
|
||||||
|
) -> str:
|
||||||
|
"""Read a file as UTF-8 text, replacing invalid bytes rather than raising.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
filepath:
|
||||||
|
Absolute path to the file to read.
|
||||||
|
log:
|
||||||
|
Logger to use for warnings. Falls back to the module-level logger
|
||||||
|
when omitted.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
str
|
||||||
|
File content as a string, with any invalid UTF-8 sequences replaced
|
||||||
|
by the Unicode replacement character.
|
||||||
|
"""
|
||||||
|
_log = log or logger
|
||||||
|
try:
|
||||||
|
return filepath.read_text(encoding="utf-8")
|
||||||
|
except UnicodeDecodeError as e:
|
||||||
|
_log.warning("Unicode error during text reading, continuing: %s", e)
|
||||||
|
return filepath.read_bytes().decode("utf-8", errors="replace")
|
||||||
|
|
||||||
|
|
||||||
|
def get_page_count_for_pdf(
    document_path: Path,
    log: logging.Logger | None = None,
) -> int | None:
    """Count the pages of a PDF with pikepdf, tolerating broken files.

    Parameters
    ----------
    document_path:
        Absolute path to the PDF file.
    log:
        Logger used for the warning on failure; defaults to this module's
        logger.

    Returns
    -------
    int | None
        Number of pages, or ``None`` when the file cannot be opened or is
        not a valid PDF.
    """
    import pikepdf

    active_log = log or logger

    try:
        with pikepdf.Pdf.open(document_path) as document:
            return len(document.pages)
    except Exception as exc:
        active_log.warning(
            "Unable to determine PDF page count for %s: %s",
            document_path,
            exc,
        )
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pdf_metadata(
    document_path: Path,
    log: logging.Logger | None = None,
) -> list[MetadataEntry]:
    """Extract XMP/PDF metadata from a PDF file using pikepdf.

    Reads all XMP metadata entries from the document and returns them as a
    list of ``MetadataEntry`` dicts. The function never raises — any failure
    to open the file or read a specific key is logged and skipped.

    Parameters
    ----------
    document_path:
        Absolute path to the PDF file.
    log:
        Logger to use for warnings and debug messages. Falls back to the
        module-level logger when omitted.

    Returns
    -------
    list[MetadataEntry]
        Zero or more metadata entries. Returns ``[]`` if the file cannot
        be opened or contains no readable XMP metadata.
    """
    import pikepdf

    from paperless.parsers import MetadataEntry

    _log = log or logger
    result: list[MetadataEntry] = []
    # XMP keys look like "{namespace-uri}local-name".
    namespace_pattern = re.compile(r"\{(.*)\}(.*)")

    try:
        pdf = pikepdf.open(document_path)
    except Exception as e:
        _log.warning("Could not open PDF metadata for %s: %s", document_path, e)
        return []

    # Fix: the Pdf object was previously never closed, leaking the file
    # handle. The context manager guarantees it is closed on every path.
    with pdf:
        try:
            meta = pdf.open_metadata()
        except Exception as e:
            _log.warning("Could not open PDF metadata for %s: %s", document_path, e)
            return []

        for key, value in meta.items():
            if isinstance(value, list):
                value = " ".join(str(item) for item in value)
            value = str(value)

            try:
                m = namespace_pattern.match(key)
                if m is None:
                    # Key without a namespace wrapper — nothing to report.
                    continue

                namespace = m.group(1)
                key_value = m.group(2)

                try:
                    # Guard against surrogate/unencodable names leaking into
                    # the API response.
                    namespace.encode("utf-8")
                    key_value.encode("utf-8")
                except UnicodeEncodeError as enc_err:  # pragma: no cover
                    _log.debug("Skipping metadata key %s: %s", key, enc_err)
                    continue

                result.append(
                    MetadataEntry(
                        namespace=namespace,
                        prefix=meta.REVERSE_NS[namespace],
                        key=key_value,
                        value=value,
                    ),
                )
            except Exception as e:
                # Best effort per key: a single bad entry must not discard
                # the rest of the metadata.
                _log.warning(
                    "Error reading metadata key %s value %s: %s",
                    key,
                    value,
                    e,
                )

    return result
|
||||||
@@ -6,6 +6,7 @@ from allauth.mfa.models import Authenticator
|
|||||||
from allauth.mfa.totp.internal.auth import TOTP
|
from allauth.mfa.totp.internal.auth import TOTP
|
||||||
from allauth.socialaccount.models import SocialAccount
|
from allauth.socialaccount.models import SocialAccount
|
||||||
from allauth.socialaccount.models import SocialApp
|
from allauth.socialaccount.models import SocialApp
|
||||||
|
from django.conf import settings
|
||||||
from django.contrib.auth.models import Group
|
from django.contrib.auth.models import Group
|
||||||
from django.contrib.auth.models import Permission
|
from django.contrib.auth.models import Permission
|
||||||
from django.contrib.auth.models import User
|
from django.contrib.auth.models import User
|
||||||
@@ -15,6 +16,7 @@ from rest_framework import serializers
|
|||||||
from rest_framework.authtoken.serializers import AuthTokenSerializer
|
from rest_framework.authtoken.serializers import AuthTokenSerializer
|
||||||
|
|
||||||
from paperless.models import ApplicationConfiguration
|
from paperless.models import ApplicationConfiguration
|
||||||
|
from paperless.network import validate_outbound_http_url
|
||||||
from paperless.validators import reject_dangerous_svg
|
from paperless.validators import reject_dangerous_svg
|
||||||
from paperless_mail.serialisers import ObfuscatedPasswordField
|
from paperless_mail.serialisers import ObfuscatedPasswordField
|
||||||
|
|
||||||
@@ -236,6 +238,22 @@ class ApplicationConfigurationSerializer(serializers.ModelSerializer):
|
|||||||
reject_dangerous_svg(file)
|
reject_dangerous_svg(file)
|
||||||
return file
|
return file
|
||||||
|
|
||||||
|
def validate_llm_endpoint(self, value: str | None) -> str | None:
|
||||||
|
if not value:
|
||||||
|
return value
|
||||||
|
|
||||||
|
try:
|
||||||
|
validate_outbound_http_url(
|
||||||
|
value,
|
||||||
|
allow_internal=settings.LLM_ALLOW_INTERNAL_ENDPOINTS,
|
||||||
|
)
|
||||||
|
except ValueError as e:
|
||||||
|
raise serializers.ValidationError(
|
||||||
|
f"Invalid LLM endpoint: {e.args[0]}, see logs for details",
|
||||||
|
) from e
|
||||||
|
|
||||||
|
return value
|
||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
model = ApplicationConfiguration
|
model = ApplicationConfiguration
|
||||||
fields = "__all__"
|
fields = "__all__"
|
||||||
|
|||||||
@@ -121,10 +121,7 @@ INSTALLED_APPS = [
|
|||||||
"django_extensions",
|
"django_extensions",
|
||||||
"paperless",
|
"paperless",
|
||||||
"documents.apps.DocumentsConfig",
|
"documents.apps.DocumentsConfig",
|
||||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
|
||||||
"paperless_text.apps.PaperlessTextConfig",
|
|
||||||
"paperless_mail.apps.PaperlessMailConfig",
|
"paperless_mail.apps.PaperlessMailConfig",
|
||||||
"paperless_remote.apps.PaperlessRemoteParserConfig",
|
|
||||||
"django.contrib.admin",
|
"django.contrib.admin",
|
||||||
"rest_framework",
|
"rest_framework",
|
||||||
"rest_framework.authtoken",
|
"rest_framework.authtoken",
|
||||||
@@ -974,8 +971,8 @@ TIKA_GOTENBERG_ENDPOINT = os.getenv(
|
|||||||
"http://localhost:3000",
|
"http://localhost:3000",
|
||||||
)
|
)
|
||||||
|
|
||||||
if TIKA_ENABLED:
|
# Tika parser is now integrated into the main parser registry
|
||||||
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
|
# No separate Django app needed
|
||||||
|
|
||||||
AUDIT_LOG_ENABLED = get_bool_from_env("PAPERLESS_AUDIT_LOG_ENABLED", "true")
|
AUDIT_LOG_ENABLED = get_bool_from_env("PAPERLESS_AUDIT_LOG_ENABLED", "true")
|
||||||
if AUDIT_LOG_ENABLED:
|
if AUDIT_LOG_ENABLED:
|
||||||
@@ -1112,3 +1109,7 @@ LLM_BACKEND = os.getenv("PAPERLESS_AI_LLM_BACKEND") # "ollama" or "openai"
|
|||||||
LLM_MODEL = os.getenv("PAPERLESS_AI_LLM_MODEL")
|
LLM_MODEL = os.getenv("PAPERLESS_AI_LLM_MODEL")
|
||||||
LLM_API_KEY = os.getenv("PAPERLESS_AI_LLM_API_KEY")
|
LLM_API_KEY = os.getenv("PAPERLESS_AI_LLM_API_KEY")
|
||||||
LLM_ENDPOINT = os.getenv("PAPERLESS_AI_LLM_ENDPOINT")
|
LLM_ENDPOINT = os.getenv("PAPERLESS_AI_LLM_ENDPOINT")
|
||||||
|
LLM_ALLOW_INTERNAL_ENDPOINTS = get_bool_from_env(
|
||||||
|
"PAPERLESS_AI_LLM_ALLOW_INTERNAL_ENDPOINTS",
|
||||||
|
"true",
|
||||||
|
)
|
||||||
|
|||||||
@@ -6,15 +6,29 @@ so it is easy to see which files belong to which test module.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from contextlib import contextmanager
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from django.test import override_settings
|
||||||
|
|
||||||
|
from paperless.parsers.mail import MailDocumentParser
|
||||||
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
|
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
from collections.abc import Callable
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
from pytest_django.fixtures import SettingsWrapper
|
||||||
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
|
#: Type for the ``make_tesseract_parser`` fixture factory.
|
||||||
|
MakeTesseractParser = Callable[..., Generator[RasterisedDocumentParser, None, None]]
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
@@ -74,3 +88,684 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
|
|||||||
"""
|
"""
|
||||||
with TextDocumentParser() as parser:
|
with TextDocumentParser() as parser:
|
||||||
yield parser
|
yield parser
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Remote parser instance
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def remote_parser() -> Generator[RemoteDocumentParser, None, None]:
    """Provide a RemoteDocumentParser; its temp directory is removed on exit.

    Yields
    ------
    RemoteDocumentParser
        A ready-to-use parser instance.
    """
    parser = RemoteDocumentParser()
    with parser:
        yield parser
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Remote parser settings helpers
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def azure_settings(settings: SettingsWrapper) -> SettingsWrapper:
    """Point Django settings at a fake but valid Azure AI OCR engine.

    Overrides ``REMOTE_OCR_ENGINE``, ``REMOTE_OCR_API_KEY`` and
    ``REMOTE_OCR_ENDPOINT`` with test values; pytest-django restores the
    originals after the test.

    Returns
    -------
    SettingsWrapper
        The modified settings object, so further overrides can be chained.
    """
    settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
    settings.REMOTE_OCR_API_KEY = "test-api-key"
    settings.REMOTE_OCR_ENGINE = "azureai"
    return settings


@pytest.fixture()
def no_engine_settings(settings: SettingsWrapper) -> SettingsWrapper:
    """Clear every remote-OCR setting so no engine is configured.

    Returns
    -------
    SettingsWrapper
        The modified settings object.
    """
    for name in ("REMOTE_OCR_ENGINE", "REMOTE_OCR_API_KEY", "REMOTE_OCR_ENDPOINT"):
        setattr(settings, name, None)
    return settings
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Tika parser sample files
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def tika_samples_dir(samples_dir: Path) -> Path:
    """Directory with the Tika parser sample files (``<samples_dir>/tika``)."""
    return samples_dir.joinpath("tika")


@pytest.fixture(scope="session")
def sample_odt_file(tika_samples_dir: Path) -> Path:
    """Absolute path to the ``tika/sample.odt`` test document."""
    return tika_samples_dir.joinpath("sample.odt")


@pytest.fixture(scope="session")
def sample_docx_file(tika_samples_dir: Path) -> Path:
    """Absolute path to the ``tika/sample.docx`` test document."""
    return tika_samples_dir.joinpath("sample.docx")


@pytest.fixture(scope="session")
def sample_doc_file(tika_samples_dir: Path) -> Path:
    """Absolute path to the ``tika/sample.doc`` test document."""
    return tika_samples_dir.joinpath("sample.doc")


@pytest.fixture(scope="session")
def sample_broken_odt(tika_samples_dir: Path) -> Path:
    """Absolute path to ``tika/multi-part-broken.odt``.

    This file triggers Tika's multi-part form-data fallback path.
    """
    return tika_samples_dir.joinpath("multi-part-broken.odt")
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Tika parser instance
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def tika_parser() -> Generator[TikaDocumentParser, None, None]:
    """Provide a TikaDocumentParser; its temp directory is removed on exit.

    Yields
    ------
    TikaDocumentParser
        A ready-to-use parser instance.
    """
    parser = TikaDocumentParser()
    with parser:
        yield parser
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Mail parser sample files
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def mail_samples_dir(samples_dir: Path) -> Path:
    """Directory with the mail parser sample files (``<samples_dir>/mail``)."""
    return samples_dir.joinpath("mail")


@pytest.fixture(scope="session")
def broken_email_file(mail_samples_dir: Path) -> Path:
    """Absolute path to the malformed ``mail/broken.eml`` sample."""
    return mail_samples_dir.joinpath("broken.eml")


@pytest.fixture(scope="session")
def simple_txt_email_file(mail_samples_dir: Path) -> Path:
    """Absolute path to the plain-text ``mail/simple_text.eml`` sample."""
    return mail_samples_dir.joinpath("simple_text.eml")


@pytest.fixture(scope="session")
def simple_txt_email_pdf_file(mail_samples_dir: Path) -> Path:
    """Expected PDF rendition of the plain-text email sample."""
    return mail_samples_dir.joinpath("simple_text.eml.pdf")


@pytest.fixture(scope="session")
def simple_txt_email_thumbnail_file(mail_samples_dir: Path) -> Path:
    """Expected thumbnail (``simple_text.eml.pdf.webp``) of the text email."""
    return mail_samples_dir.joinpath("simple_text.eml.pdf.webp")


@pytest.fixture(scope="session")
def html_email_file(mail_samples_dir: Path) -> Path:
    """Absolute path to the HTML ``mail/html.eml`` sample."""
    return mail_samples_dir.joinpath("html.eml")


@pytest.fixture(scope="session")
def html_email_pdf_file(mail_samples_dir: Path) -> Path:
    """Expected PDF rendition of the HTML email sample."""
    return mail_samples_dir.joinpath("html.eml.pdf")


@pytest.fixture(scope="session")
def html_email_thumbnail_file(mail_samples_dir: Path) -> Path:
    """Expected thumbnail (``html.eml.pdf.webp``) of the HTML email."""
    return mail_samples_dir.joinpath("html.eml.pdf.webp")


@pytest.fixture(scope="session")
def html_email_html_file(mail_samples_dir: Path) -> Path:
    """HTML body (``html.eml.html``) of the HTML email sample."""
    return mail_samples_dir.joinpath("html.eml.html")


@pytest.fixture(scope="session")
def merged_pdf_first(mail_samples_dir: Path) -> Path:
    """First PDF (``mail/first.pdf``) used by the PDF-merge tests."""
    return mail_samples_dir.joinpath("first.pdf")


@pytest.fixture(scope="session")
def merged_pdf_second(mail_samples_dir: Path) -> Path:
    """Second PDF (``mail/second.pdf``) used by the PDF-merge tests."""
    return mail_samples_dir.joinpath("second.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Mail parser instance
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def mail_parser() -> Generator[MailDocumentParser, None, None]:
    """Provide a MailDocumentParser; its temp directory is removed on exit.

    Yields
    ------
    MailDocumentParser
        A ready-to-use parser instance.
    """
    parser = MailDocumentParser()
    with parser:
        yield parser


@pytest.fixture(scope="session")
def nginx_base_url() -> Generator[str, None, None]:
    """Base URL of the nginx HTTP server these tests expect to be running."""
    yield "http://localhost:8080"
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Tesseract parser sample files
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def tesseract_samples_dir(samples_dir: Path) -> Path:
    """Directory with the tesseract parser samples (``<samples_dir>/tesseract``)."""
    return samples_dir.joinpath("tesseract")


@pytest.fixture(scope="session")
def document_webp_file(tesseract_samples_dir: Path) -> Path:
    """Absolute path to the ``tesseract/document.webp`` sample."""
    return tesseract_samples_dir.joinpath("document.webp")


@pytest.fixture(scope="session")
def encrypted_pdf_file(tesseract_samples_dir: Path) -> Path:
    """Absolute path to the ``tesseract/encrypted.pdf`` sample."""
    return tesseract_samples_dir.joinpath("encrypted.pdf")


@pytest.fixture(scope="session")
def multi_page_digital_pdf_file(tesseract_samples_dir: Path) -> Path:
    """Absolute path to the ``tesseract/multi-page-digital.pdf`` sample."""
    return tesseract_samples_dir.joinpath("multi-page-digital.pdf")


@pytest.fixture(scope="session")
def multi_page_images_alpha_rgb_tiff_file(tesseract_samples_dir: Path) -> Path:
    """Multi-page TIFF with an RGB alpha channel (``multi-page-images-alpha-rgb.tiff``)."""
    return tesseract_samples_dir.joinpath("multi-page-images-alpha-rgb.tiff")


@pytest.fixture(scope="session")
def multi_page_images_alpha_tiff_file(tesseract_samples_dir: Path) -> Path:
    """Multi-page TIFF with an alpha channel (``multi-page-images-alpha.tiff``)."""
    return tesseract_samples_dir.joinpath("multi-page-images-alpha.tiff")


@pytest.fixture(scope="session")
def multi_page_images_pdf_file(tesseract_samples_dir: Path) -> Path:
    """Multi-page image-based PDF (``tesseract/multi-page-images.pdf``)."""
    return tesseract_samples_dir.joinpath("multi-page-images.pdf")


@pytest.fixture(scope="session")
def multi_page_images_tiff_file(tesseract_samples_dir: Path) -> Path:
    """Multi-page TIFF sample (``tesseract/multi-page-images.tiff``)."""
    return tesseract_samples_dir.joinpath("multi-page-images.tiff")


@pytest.fixture(scope="session")
def multi_page_mixed_pdf_file(tesseract_samples_dir: Path) -> Path:
    """Multi-page mixed-content PDF (``tesseract/multi-page-mixed.pdf``)."""
    return tesseract_samples_dir.joinpath("multi-page-mixed.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def no_text_alpha_png_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a PNG with alpha channel and no text.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/no-text-alpha.png``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "no-text-alpha.png"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def rotated_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a rotated PDF sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/rotated.pdf``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "rotated.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def rtl_test_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to an RTL test PDF sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/rtl-test.pdf``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "rtl-test.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def signed_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a signed PDF sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/signed.pdf``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "signed.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_alpha_png_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a simple PNG with alpha channel.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/simple-alpha.png``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "simple-alpha.png"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_digital_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a simple digital PDF sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/simple-digital.pdf``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "simple-digital.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_no_dpi_png_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a simple PNG without DPI information.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/simple-no-dpi.png``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "simple-no-dpi.png"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_bmp_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a simple BMP sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/simple.bmp``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "simple.bmp"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_gif_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a simple GIF sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/simple.gif``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "simple.gif"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_heic_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a simple HEIC sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/simple.heic``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "simple.heic"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_jpg_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a simple JPG sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/simple.jpg``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "simple.jpg"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_png_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a simple PNG sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/simple.png``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "simple.png"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_tif_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a simple TIF sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/simple.tif``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "simple.tif"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def single_page_mixed_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a single-page mixed PDF sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/single-page-mixed.pdf``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "single-page-mixed.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def with_form_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a PDF with form sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/with-form.pdf``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "with-form.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Tesseract parser instance and settings helpers
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def null_app_config(mocker: MockerFixture) -> MagicMock:
|
||||||
|
"""Return a MagicMock with all OcrConfig fields set to None.
|
||||||
|
|
||||||
|
This allows the parser to fall back to Django settings instead of
|
||||||
|
hitting the database.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
MagicMock
|
||||||
|
Mock config with all fields as None
|
||||||
|
"""
|
||||||
|
return mocker.MagicMock(
|
||||||
|
output_type=None,
|
||||||
|
pages=None,
|
||||||
|
language=None,
|
||||||
|
mode=None,
|
||||||
|
skip_archive_file=None,
|
||||||
|
image_dpi=None,
|
||||||
|
unpaper_clean=None,
|
||||||
|
deskew=None,
|
||||||
|
rotate_pages=None,
|
||||||
|
rotate_pages_threshold=None,
|
||||||
|
max_image_pixels=None,
|
||||||
|
color_conversion_strategy=None,
|
||||||
|
user_args=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def tesseract_parser(
|
||||||
|
mocker: MockerFixture,
|
||||||
|
null_app_config: MagicMock,
|
||||||
|
) -> Generator[RasterisedDocumentParser, None, None]:
|
||||||
|
"""Yield a RasterisedDocumentParser and clean up its temporary directory afterwards.
|
||||||
|
|
||||||
|
Patches the config system to avoid database access.
|
||||||
|
|
||||||
|
Yields
|
||||||
|
------
|
||||||
|
RasterisedDocumentParser
|
||||||
|
A ready-to-use parser instance.
|
||||||
|
"""
|
||||||
|
mocker.patch(
|
||||||
|
"paperless.config.BaseConfig._get_config_instance",
|
||||||
|
return_value=null_app_config,
|
||||||
|
)
|
||||||
|
with RasterisedDocumentParser() as parser:
|
||||||
|
yield parser
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def make_tesseract_parser(
|
||||||
|
mocker: MockerFixture,
|
||||||
|
null_app_config: MagicMock,
|
||||||
|
) -> MakeTesseractParser:
|
||||||
|
"""Return a factory for creating RasterisedDocumentParser with Django settings overrides.
|
||||||
|
|
||||||
|
This fixture is useful for tests that need to create parsers with different
|
||||||
|
settings configurations.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Callable[..., contextmanager[RasterisedDocumentParser]]
|
||||||
|
A context manager factory that accepts Django settings overrides
|
||||||
|
"""
|
||||||
|
mocker.patch(
|
||||||
|
"paperless.config.BaseConfig._get_config_instance",
|
||||||
|
return_value=null_app_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def _make_parser(**django_settings_overrides):
|
||||||
|
with override_settings(**django_settings_overrides):
|
||||||
|
with RasterisedDocumentParser() as parser:
|
||||||
|
yield parser
|
||||||
|
|
||||||
|
return _make_parser
|
||||||
|
|||||||
@@ -12,7 +12,64 @@ from pytest_httpx import HTTPXMock
|
|||||||
from pytest_mock import MockerFixture
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from paperless_mail.parsers import MailDocumentParser
|
from paperless.parsers import ParserContext
|
||||||
|
from paperless.parsers import ParserProtocol
|
||||||
|
from paperless.parsers.mail import MailDocumentParser
|
||||||
|
|
||||||
|
|
||||||
|
class TestMailParserProtocol:
|
||||||
|
"""Verify that MailDocumentParser satisfies the ParserProtocol contract."""
|
||||||
|
|
||||||
|
def test_isinstance_satisfies_protocol(
|
||||||
|
self,
|
||||||
|
mail_parser: MailDocumentParser,
|
||||||
|
) -> None:
|
||||||
|
assert isinstance(mail_parser, ParserProtocol)
|
||||||
|
|
||||||
|
def test_supported_mime_types(self) -> None:
|
||||||
|
mime_types = MailDocumentParser.supported_mime_types()
|
||||||
|
assert isinstance(mime_types, dict)
|
||||||
|
assert "message/rfc822" in mime_types
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("mime_type", "expected"),
|
||||||
|
[
|
||||||
|
("message/rfc822", 10),
|
||||||
|
("application/pdf", None),
|
||||||
|
("text/plain", None),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_score(self, mime_type: str, expected: int | None) -> None:
|
||||||
|
assert MailDocumentParser.score(mime_type, "email.eml") == expected
|
||||||
|
|
||||||
|
def test_can_produce_archive_is_false(
|
||||||
|
self,
|
||||||
|
mail_parser: MailDocumentParser,
|
||||||
|
) -> None:
|
||||||
|
assert mail_parser.can_produce_archive is False
|
||||||
|
|
||||||
|
def test_requires_pdf_rendition_is_true(
|
||||||
|
self,
|
||||||
|
mail_parser: MailDocumentParser,
|
||||||
|
) -> None:
|
||||||
|
assert mail_parser.requires_pdf_rendition is True
|
||||||
|
|
||||||
|
def test_get_page_count_returns_none_without_archive(
|
||||||
|
self,
|
||||||
|
mail_parser: MailDocumentParser,
|
||||||
|
html_email_file: Path,
|
||||||
|
) -> None:
|
||||||
|
assert mail_parser.get_page_count(html_email_file, "message/rfc822") is None
|
||||||
|
|
||||||
|
def test_get_page_count_returns_int_with_pdf_archive(
|
||||||
|
self,
|
||||||
|
mail_parser: MailDocumentParser,
|
||||||
|
simple_txt_email_pdf_file: Path,
|
||||||
|
) -> None:
|
||||||
|
mail_parser._archive_path = simple_txt_email_pdf_file
|
||||||
|
count = mail_parser.get_page_count(simple_txt_email_pdf_file, "message/rfc822")
|
||||||
|
assert isinstance(count, int)
|
||||||
|
assert count > 0
|
||||||
|
|
||||||
|
|
||||||
class TestEmailFileParsing:
|
class TestEmailFileParsing:
|
||||||
@@ -24,7 +81,7 @@ class TestEmailFileParsing:
|
|||||||
def test_parse_error_missing_file(
|
def test_parse_error_missing_file(
|
||||||
self,
|
self,
|
||||||
mail_parser: MailDocumentParser,
|
mail_parser: MailDocumentParser,
|
||||||
sample_dir: Path,
|
mail_samples_dir: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
@@ -35,7 +92,7 @@ class TestEmailFileParsing:
|
|||||||
- An Exception is thrown
|
- An Exception is thrown
|
||||||
"""
|
"""
|
||||||
# Check if exception is raised when parsing fails.
|
# Check if exception is raised when parsing fails.
|
||||||
test_file = sample_dir / "doesntexist.eml"
|
test_file = mail_samples_dir / "doesntexist.eml"
|
||||||
|
|
||||||
assert not test_file.exists()
|
assert not test_file.exists()
|
||||||
|
|
||||||
@@ -246,12 +303,12 @@ class TestEmailThumbnailGenerate:
|
|||||||
"""
|
"""
|
||||||
mocked_return = "Passing the return value through.."
|
mocked_return = "Passing the return value through.."
|
||||||
mock_make_thumbnail_from_pdf = mocker.patch(
|
mock_make_thumbnail_from_pdf = mocker.patch(
|
||||||
"paperless_mail.parsers.make_thumbnail_from_pdf",
|
"paperless.parsers.mail.make_thumbnail_from_pdf",
|
||||||
)
|
)
|
||||||
mock_make_thumbnail_from_pdf.return_value = mocked_return
|
mock_make_thumbnail_from_pdf.return_value = mocked_return
|
||||||
|
|
||||||
mock_generate_pdf = mocker.patch(
|
mock_generate_pdf = mocker.patch(
|
||||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
||||||
)
|
)
|
||||||
mock_generate_pdf.return_value = "Mocked return value.."
|
mock_generate_pdf.return_value = "Mocked return value.."
|
||||||
|
|
||||||
@@ -260,8 +317,7 @@ class TestEmailThumbnailGenerate:
|
|||||||
mock_generate_pdf.assert_called_once()
|
mock_generate_pdf.assert_called_once()
|
||||||
mock_make_thumbnail_from_pdf.assert_called_once_with(
|
mock_make_thumbnail_from_pdf.assert_called_once_with(
|
||||||
"Mocked return value..",
|
"Mocked return value..",
|
||||||
mail_parser.tempdir,
|
mail_parser._tempdir,
|
||||||
None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert mocked_return == thumb
|
assert mocked_return == thumb
|
||||||
@@ -373,7 +429,7 @@ class TestParser:
|
|||||||
"""
|
"""
|
||||||
# Validate parsing returns the expected results
|
# Validate parsing returns the expected results
|
||||||
mock_generate_pdf = mocker.patch(
|
mock_generate_pdf = mocker.patch(
|
||||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
||||||
)
|
)
|
||||||
|
|
||||||
mail_parser.parse(simple_txt_email_file, "message/rfc822")
|
mail_parser.parse(simple_txt_email_file, "message/rfc822")
|
||||||
@@ -385,7 +441,7 @@ class TestParser:
|
|||||||
"BCC: fdf@fvf.de\n\n"
|
"BCC: fdf@fvf.de\n\n"
|
||||||
"\n\nThis is just a simple Text Mail."
|
"\n\nThis is just a simple Text Mail."
|
||||||
)
|
)
|
||||||
assert text_expected == mail_parser.text
|
assert text_expected == mail_parser.get_text()
|
||||||
assert (
|
assert (
|
||||||
datetime.datetime(
|
datetime.datetime(
|
||||||
2022,
|
2022,
|
||||||
@@ -396,7 +452,7 @@ class TestParser:
|
|||||||
43,
|
43,
|
||||||
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
|
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
|
||||||
)
|
)
|
||||||
== mail_parser.date
|
== mail_parser.get_date()
|
||||||
)
|
)
|
||||||
|
|
||||||
# Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
|
# Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
|
||||||
@@ -419,7 +475,7 @@ class TestParser:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
mock_generate_pdf = mocker.patch(
|
mock_generate_pdf = mocker.patch(
|
||||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Validate parsing returns the expected results
|
# Validate parsing returns the expected results
|
||||||
@@ -443,7 +499,7 @@ class TestParser:
|
|||||||
mail_parser.parse(html_email_file, "message/rfc822")
|
mail_parser.parse(html_email_file, "message/rfc822")
|
||||||
|
|
||||||
mock_generate_pdf.assert_called_once()
|
mock_generate_pdf.assert_called_once()
|
||||||
assert text_expected == mail_parser.text
|
assert text_expected == mail_parser.get_text()
|
||||||
assert (
|
assert (
|
||||||
datetime.datetime(
|
datetime.datetime(
|
||||||
2022,
|
2022,
|
||||||
@@ -454,7 +510,7 @@ class TestParser:
|
|||||||
19,
|
19,
|
||||||
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
|
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
|
||||||
)
|
)
|
||||||
== mail_parser.date
|
== mail_parser.get_date()
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_generate_pdf_parse_error(
|
def test_generate_pdf_parse_error(
|
||||||
@@ -501,7 +557,7 @@ class TestParser:
|
|||||||
|
|
||||||
mail_parser.parse(simple_txt_email_file, "message/rfc822")
|
mail_parser.parse(simple_txt_email_file, "message/rfc822")
|
||||||
|
|
||||||
assert mail_parser.archive_path is not None
|
assert mail_parser.get_archive_path() is not None
|
||||||
|
|
||||||
@pytest.mark.httpx_mock(can_send_already_matched_responses=True)
|
@pytest.mark.httpx_mock(can_send_already_matched_responses=True)
|
||||||
def test_generate_pdf_html_email(
|
def test_generate_pdf_html_email(
|
||||||
@@ -542,7 +598,7 @@ class TestParser:
|
|||||||
)
|
)
|
||||||
mail_parser.parse(html_email_file, "message/rfc822")
|
mail_parser.parse(html_email_file, "message/rfc822")
|
||||||
|
|
||||||
assert mail_parser.archive_path is not None
|
assert mail_parser.get_archive_path() is not None
|
||||||
|
|
||||||
def test_generate_pdf_html_email_html_to_pdf_failure(
|
def test_generate_pdf_html_email_html_to_pdf_failure(
|
||||||
self,
|
self,
|
||||||
@@ -712,10 +768,10 @@ class TestParser:
|
|||||||
|
|
||||||
def test_layout_option(layout_option, expected_calls, expected_pdf_names):
|
def test_layout_option(layout_option, expected_calls, expected_pdf_names):
|
||||||
mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
|
mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
|
||||||
|
mail_parser.configure(ParserContext(mailrule_id=1))
|
||||||
mail_parser.parse(
|
mail_parser.parse(
|
||||||
document_path=html_email_file,
|
document_path=html_email_file,
|
||||||
mime_type="message/rfc822",
|
mime_type="message/rfc822",
|
||||||
mailrule_id=1,
|
|
||||||
)
|
)
|
||||||
args, _ = mock_merge_route.call_args
|
args, _ = mock_merge_route.call_args
|
||||||
assert len(args[0]) == expected_calls
|
assert len(args[0]) == expected_calls
|
||||||
@@ -11,7 +11,7 @@ from PIL import Image
|
|||||||
from pytest_mock import MockerFixture
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
from documents.tests.utils import util_call_with_backoff
|
from documents.tests.utils import util_call_with_backoff
|
||||||
from paperless_mail.parsers import MailDocumentParser
|
from paperless.parsers.mail import MailDocumentParser
|
||||||
|
|
||||||
|
|
||||||
def extract_text(pdf_path: Path) -> str:
|
def extract_text(pdf_path: Path) -> str:
|
||||||
@@ -159,7 +159,7 @@ class TestParserLive:
|
|||||||
- The returned thumbnail image file shall match the expected hash
|
- The returned thumbnail image file shall match the expected hash
|
||||||
"""
|
"""
|
||||||
mock_generate_pdf = mocker.patch(
|
mock_generate_pdf = mocker.patch(
|
||||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
||||||
)
|
)
|
||||||
mock_generate_pdf.return_value = simple_txt_email_pdf_file
|
mock_generate_pdf.return_value = simple_txt_email_pdf_file
|
||||||
|
|
||||||
@@ -216,10 +216,10 @@ class TestParserLive:
|
|||||||
- The merged PDF shall contain text from both source PDFs
|
- The merged PDF shall contain text from both source PDFs
|
||||||
"""
|
"""
|
||||||
mock_generate_pdf_from_html = mocker.patch(
|
mock_generate_pdf_from_html = mocker.patch(
|
||||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
|
"paperless.parsers.mail.MailDocumentParser.generate_pdf_from_html",
|
||||||
)
|
)
|
||||||
mock_generate_pdf_from_mail = mocker.patch(
|
mock_generate_pdf_from_mail = mocker.patch(
|
||||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
|
"paperless.parsers.mail.MailDocumentParser.generate_pdf_from_mail",
|
||||||
)
|
)
|
||||||
mock_generate_pdf_from_mail.return_value = merged_pdf_first
|
mock_generate_pdf_from_mail.return_value = merged_pdf_first
|
||||||
mock_generate_pdf_from_html.return_value = merged_pdf_second
|
mock_generate_pdf_from_html.return_value = merged_pdf_second
|
||||||
503
src/paperless/tests/parsers/test_remote_parser.py
Normal file
@@ -0,0 +1,503 @@
|
|||||||
|
"""
|
||||||
|
Tests for paperless.parsers.remote.RemoteDocumentParser.
|
||||||
|
|
||||||
|
All tests use the context-manager protocol for parser lifecycle.
|
||||||
|
|
||||||
|
Fixture layout
|
||||||
|
--------------
|
||||||
|
make_azure_mock — factory (defined here; specific to this module)
|
||||||
|
azure_client — composes azure_settings + make_azure_mock + patch;
|
||||||
|
use when a test needs the client to succeed
|
||||||
|
failing_azure_client
|
||||||
|
— composes azure_settings + patch with RuntimeError;
|
||||||
|
use when a test needs the client to fail
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
from unittest.mock import Mock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from paperless.parsers import ParserContext
|
||||||
|
from paperless.parsers import ParserProtocol
|
||||||
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from collections.abc import Callable
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pytest_django.fixtures import SettingsWrapper
|
||||||
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Module-local fixtures
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_AZURE_CLIENT_TARGET = "azure.ai.documentintelligence.DocumentIntelligenceClient"
|
||||||
|
_DEFAULT_TEXT = "Extracted text."
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def make_azure_mock() -> Callable[[str], Mock]:
|
||||||
|
"""Return a factory that builds a mock Azure DocumentIntelligenceClient.
|
||||||
|
|
||||||
|
Usage::
|
||||||
|
|
||||||
|
mock_client = make_azure_mock() # default extracted text
|
||||||
|
mock_client = make_azure_mock("My text.") # custom extracted text
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _factory(text: str = _DEFAULT_TEXT) -> Mock:
|
||||||
|
mock_client = Mock()
|
||||||
|
mock_poller = Mock()
|
||||||
|
mock_poller.wait.return_value = None
|
||||||
|
mock_poller.details = {"operation_id": "fake-op-id"}
|
||||||
|
mock_poller.result.return_value.content = text
|
||||||
|
mock_client.begin_analyze_document.return_value = mock_poller
|
||||||
|
mock_client.get_analyze_result_pdf.return_value = [b"%PDF-1.4 FAKE"]
|
||||||
|
return mock_client
|
||||||
|
|
||||||
|
return _factory
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def azure_client(
|
||||||
|
azure_settings: SettingsWrapper,
|
||||||
|
make_azure_mock: Callable[[str], Mock],
|
||||||
|
mocker: MockerFixture,
|
||||||
|
) -> Mock:
|
||||||
|
"""Patch the Azure DI client with a succeeding mock and return the instance.
|
||||||
|
|
||||||
|
Implicitly applies ``azure_settings`` so tests using this fixture do not
|
||||||
|
also need ``@pytest.mark.usefixtures("azure_settings")``.
|
||||||
|
"""
|
||||||
|
mock_client = make_azure_mock()
|
||||||
|
mocker.patch(_AZURE_CLIENT_TARGET, return_value=mock_client)
|
||||||
|
return mock_client
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def failing_azure_client(
|
||||||
|
azure_settings: SettingsWrapper,
|
||||||
|
mocker: MockerFixture,
|
||||||
|
) -> Mock:
|
||||||
|
"""Patch the Azure DI client to raise RuntimeError on every call.
|
||||||
|
|
||||||
|
Implicitly applies ``azure_settings``. Returns the mock instance so
|
||||||
|
tests can assert on calls such as ``close()``.
|
||||||
|
"""
|
||||||
|
mock_client = Mock()
|
||||||
|
mock_client.begin_analyze_document.side_effect = RuntimeError("network failure")
|
||||||
|
mocker.patch(_AZURE_CLIENT_TARGET, return_value=mock_client)
|
||||||
|
return mock_client
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Protocol contract
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserProtocol:
|
||||||
|
"""Verify that RemoteDocumentParser satisfies the ParserProtocol contract."""
|
||||||
|
|
||||||
|
def test_isinstance_satisfies_protocol(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
) -> None:
|
||||||
|
assert isinstance(remote_parser, ParserProtocol)
|
||||||
|
|
||||||
|
def test_class_attributes_present(self) -> None:
|
||||||
|
assert isinstance(RemoteDocumentParser.name, str) and RemoteDocumentParser.name
|
||||||
|
assert (
|
||||||
|
isinstance(RemoteDocumentParser.version, str)
|
||||||
|
and RemoteDocumentParser.version
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
isinstance(RemoteDocumentParser.author, str) and RemoteDocumentParser.author
|
||||||
|
)
|
||||||
|
assert isinstance(RemoteDocumentParser.url, str) and RemoteDocumentParser.url
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# supported_mime_types
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserSupportedMimeTypes:
|
||||||
|
"""supported_mime_types() always returns the full set regardless of config."""
|
||||||
|
|
||||||
|
def test_returns_dict(self) -> None:
|
||||||
|
mime_types = RemoteDocumentParser.supported_mime_types()
|
||||||
|
assert isinstance(mime_types, dict)
|
||||||
|
|
||||||
|
def test_includes_all_expected_types(self) -> None:
|
||||||
|
mime_types = RemoteDocumentParser.supported_mime_types()
|
||||||
|
expected = {
|
||||||
|
"application/pdf",
|
||||||
|
"image/png",
|
||||||
|
"image/jpeg",
|
||||||
|
"image/tiff",
|
||||||
|
"image/bmp",
|
||||||
|
"image/gif",
|
||||||
|
"image/webp",
|
||||||
|
}
|
||||||
|
assert expected == set(mime_types.keys())
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("no_engine_settings")
|
||||||
|
def test_returns_full_set_when_not_configured(self) -> None:
|
||||||
|
"""
|
||||||
|
GIVEN: No remote engine is configured
|
||||||
|
WHEN: supported_mime_types() is called
|
||||||
|
THEN: The full MIME type dict is still returned (score() handles activation)
|
||||||
|
"""
|
||||||
|
mime_types = RemoteDocumentParser.supported_mime_types()
|
||||||
|
assert len(mime_types) == 7
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# score()
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserScore:
|
||||||
|
"""score() encodes the activation logic: None when unconfigured, 20 when active."""
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("azure_settings")
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"mime_type",
|
||||||
|
[
|
||||||
|
pytest.param("application/pdf", id="pdf"),
|
||||||
|
pytest.param("image/png", id="png"),
|
||||||
|
pytest.param("image/jpeg", id="jpeg"),
|
||||||
|
pytest.param("image/tiff", id="tiff"),
|
||||||
|
pytest.param("image/bmp", id="bmp"),
|
||||||
|
pytest.param("image/gif", id="gif"),
|
||||||
|
pytest.param("image/webp", id="webp"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_score_returns_20_when_configured(self, mime_type: str) -> None:
|
||||||
|
result = RemoteDocumentParser.score(mime_type, "doc.pdf")
|
||||||
|
assert result == 20
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("no_engine_settings")
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"mime_type",
|
||||||
|
[
|
||||||
|
pytest.param("application/pdf", id="pdf"),
|
||||||
|
pytest.param("image/png", id="png"),
|
||||||
|
pytest.param("image/jpeg", id="jpeg"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_score_returns_none_when_no_engine(self, mime_type: str) -> None:
|
||||||
|
result = RemoteDocumentParser.score(mime_type, "doc.pdf")
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_score_returns_none_when_api_key_missing(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
) -> None:
|
||||||
|
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||||
|
settings.REMOTE_OCR_API_KEY = None
|
||||||
|
settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
|
||||||
|
result = RemoteDocumentParser.score("application/pdf", "doc.pdf")
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_score_returns_none_when_endpoint_missing(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
) -> None:
|
||||||
|
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||||
|
settings.REMOTE_OCR_API_KEY = "key"
|
||||||
|
settings.REMOTE_OCR_ENDPOINT = None
|
||||||
|
result = RemoteDocumentParser.score("application/pdf", "doc.pdf")
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("azure_settings")
|
||||||
|
def test_score_returns_none_for_unsupported_mime_type(self) -> None:
|
||||||
|
result = RemoteDocumentParser.score("text/plain", "doc.txt")
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("azure_settings")
|
||||||
|
def test_score_higher_than_tesseract_default(self) -> None:
|
||||||
|
"""Remote parser (20) outranks the tesseract default (10) when configured."""
|
||||||
|
score = RemoteDocumentParser.score("application/pdf", "doc.pdf")
|
||||||
|
assert score is not None and score > 10
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Properties
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserProperties:
|
||||||
|
def test_can_produce_archive_is_true(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
) -> None:
|
||||||
|
assert remote_parser.can_produce_archive is True
|
||||||
|
|
||||||
|
def test_requires_pdf_rendition_is_false(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
) -> None:
|
||||||
|
assert remote_parser.requires_pdf_rendition is False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Lifecycle
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserLifecycle:
|
||||||
|
def test_context_manager_cleans_up_tempdir(self) -> None:
|
||||||
|
with RemoteDocumentParser() as parser:
|
||||||
|
tempdir = parser._tempdir
|
||||||
|
assert tempdir.exists()
|
||||||
|
assert not tempdir.exists()
|
||||||
|
|
||||||
|
def test_context_manager_cleans_up_after_exception(self) -> None:
|
||||||
|
tempdir: Path | None = None
|
||||||
|
with pytest.raises(RuntimeError):
|
||||||
|
with RemoteDocumentParser() as parser:
|
||||||
|
tempdir = parser._tempdir
|
||||||
|
raise RuntimeError("boom")
|
||||||
|
assert tempdir is not None
|
||||||
|
assert not tempdir.exists()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# parse() — happy path
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserParse:
|
||||||
|
def test_parse_returns_text_from_azure(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
simple_digital_pdf_file: Path,
|
||||||
|
azure_client: Mock,
|
||||||
|
) -> None:
|
||||||
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
assert remote_parser.get_text() == _DEFAULT_TEXT
|
||||||
|
|
||||||
|
def test_parse_sets_archive_path(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
simple_digital_pdf_file: Path,
|
||||||
|
azure_client: Mock,
|
||||||
|
) -> None:
|
||||||
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
archive = remote_parser.get_archive_path()
|
||||||
|
assert archive is not None
|
||||||
|
assert archive.exists()
|
||||||
|
assert archive.suffix == ".pdf"
|
||||||
|
|
||||||
|
def test_parse_closes_client_on_success(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
simple_digital_pdf_file: Path,
|
||||||
|
azure_client: Mock,
|
||||||
|
) -> None:
|
||||||
|
remote_parser.configure(ParserContext())
|
||||||
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
azure_client.close.assert_called_once()
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("no_engine_settings")
|
||||||
|
def test_parse_sets_empty_text_when_not_configured(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
simple_digital_pdf_file: Path,
|
||||||
|
) -> None:
|
||||||
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
assert remote_parser.get_text() == ""
|
||||||
|
assert remote_parser.get_archive_path() is None
|
||||||
|
|
||||||
|
def test_get_text_none_before_parse(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
) -> None:
|
||||||
|
assert remote_parser.get_text() is None
|
||||||
|
|
||||||
|
def test_get_date_always_none(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
simple_digital_pdf_file: Path,
|
||||||
|
azure_client: Mock,
|
||||||
|
) -> None:
|
||||||
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
assert remote_parser.get_date() is None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# parse() — Azure failure path
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserParseError:
|
||||||
|
def test_parse_returns_none_on_azure_error(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
simple_digital_pdf_file: Path,
|
||||||
|
failing_azure_client: Mock,
|
||||||
|
) -> None:
|
||||||
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
assert remote_parser.get_text() is None
|
||||||
|
|
||||||
|
def test_parse_closes_client_on_error(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
simple_digital_pdf_file: Path,
|
||||||
|
failing_azure_client: Mock,
|
||||||
|
) -> None:
|
||||||
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
failing_azure_client.close.assert_called_once()
|
||||||
|
|
||||||
|
def test_parse_logs_error_on_azure_failure(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
simple_digital_pdf_file: Path,
|
||||||
|
failing_azure_client: Mock,
|
||||||
|
mocker: MockerFixture,
|
||||||
|
) -> None:
|
||||||
|
mock_log = mocker.patch("paperless.parsers.remote.logger")
|
||||||
|
|
||||||
|
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
mock_log.error.assert_called_once()
|
||||||
|
assert "Azure AI Vision parsing failed" in mock_log.error.call_args[0][0]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# get_page_count()
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserPageCount:
|
||||||
|
def test_page_count_for_pdf(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
simple_digital_pdf_file: Path,
|
||||||
|
) -> None:
|
||||||
|
count = remote_parser.get_page_count(simple_digital_pdf_file, "application/pdf")
|
||||||
|
assert isinstance(count, int)
|
||||||
|
assert count >= 1
|
||||||
|
|
||||||
|
def test_page_count_returns_none_for_image_mime(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
simple_digital_pdf_file: Path,
|
||||||
|
) -> None:
|
||||||
|
count = remote_parser.get_page_count(simple_digital_pdf_file, "image/png")
|
||||||
|
assert count is None
|
||||||
|
|
||||||
|
def test_page_count_returns_none_for_invalid_pdf(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
tmp_path: Path,
|
||||||
|
) -> None:
|
||||||
|
bad_pdf = tmp_path / "bad.pdf"
|
||||||
|
bad_pdf.write_bytes(b"not a pdf at all")
|
||||||
|
count = remote_parser.get_page_count(bad_pdf, "application/pdf")
|
||||||
|
assert count is None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# extract_metadata()
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserMetadata:
|
||||||
|
def test_extract_metadata_non_pdf_returns_empty(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
simple_digital_pdf_file: Path,
|
||||||
|
) -> None:
|
||||||
|
result = remote_parser.extract_metadata(simple_digital_pdf_file, "image/png")
|
||||||
|
assert result == []
|
||||||
|
|
||||||
|
def test_extract_metadata_pdf_returns_list(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
simple_digital_pdf_file: Path,
|
||||||
|
) -> None:
|
||||||
|
result = remote_parser.extract_metadata(
|
||||||
|
simple_digital_pdf_file,
|
||||||
|
"application/pdf",
|
||||||
|
)
|
||||||
|
assert isinstance(result, list)
|
||||||
|
|
||||||
|
def test_extract_metadata_pdf_entries_have_required_keys(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
simple_digital_pdf_file: Path,
|
||||||
|
) -> None:
|
||||||
|
result = remote_parser.extract_metadata(
|
||||||
|
simple_digital_pdf_file,
|
||||||
|
"application/pdf",
|
||||||
|
)
|
||||||
|
for entry in result:
|
||||||
|
assert "namespace" in entry
|
||||||
|
assert "prefix" in entry
|
||||||
|
assert "key" in entry
|
||||||
|
assert "value" in entry
|
||||||
|
assert isinstance(entry["value"], str)
|
||||||
|
|
||||||
|
def test_extract_metadata_does_not_raise_on_invalid_pdf(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
tmp_path: Path,
|
||||||
|
) -> None:
|
||||||
|
bad_pdf = tmp_path / "bad.pdf"
|
||||||
|
bad_pdf.write_bytes(b"not a pdf at all")
|
||||||
|
result = remote_parser.extract_metadata(bad_pdf, "application/pdf")
|
||||||
|
assert result == []
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Registry integration
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserRegistry:
|
||||||
|
def test_registered_in_defaults(self) -> None:
|
||||||
|
from paperless.parsers.registry import ParserRegistry
|
||||||
|
|
||||||
|
registry = ParserRegistry()
|
||||||
|
registry.register_defaults()
|
||||||
|
|
||||||
|
assert RemoteDocumentParser in registry._builtins
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("azure_settings")
|
||||||
|
def test_get_parser_returns_remote_when_configured(self) -> None:
|
||||||
|
from paperless.parsers.registry import get_parser_registry
|
||||||
|
|
||||||
|
registry = get_parser_registry()
|
||||||
|
parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
|
||||||
|
|
||||||
|
assert parser_cls is RemoteDocumentParser
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("no_engine_settings")
|
||||||
|
def test_get_parser_returns_none_for_unsupported_type_when_not_configured(
|
||||||
|
self,
|
||||||
|
) -> None:
|
||||||
|
"""With remote off and a truly unsupported MIME type, registry returns None."""
|
||||||
|
from paperless.parsers.registry import ParserRegistry
|
||||||
|
|
||||||
|
registry = ParserRegistry()
|
||||||
|
registry.register_defaults()
|
||||||
|
parser_cls = registry.get_parser_for_file(
|
||||||
|
"application/x-unknown-format",
|
||||||
|
"doc.xyz",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert parser_cls is None
|
||||||
@@ -10,7 +10,7 @@ from paperless.models import CleanChoices
|
|||||||
from paperless.models import ColorConvertChoices
|
from paperless.models import ColorConvertChoices
|
||||||
from paperless.models import ModeChoices
|
from paperless.models import ModeChoices
|
||||||
from paperless.models import OutputTypeChoices
|
from paperless.models import OutputTypeChoices
|
||||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||||
|
|
||||||
|
|
||||||
class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||||
1174
src/paperless/tests/parsers/test_tesseract_parser.py
Normal file
@@ -12,6 +12,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from paperless.parsers import ParserContext
|
||||||
from paperless.parsers import ParserProtocol
|
from paperless.parsers import ParserProtocol
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
|
||||||
@@ -93,6 +94,7 @@ class TestTextParserParse:
|
|||||||
text_parser: TextDocumentParser,
|
text_parser: TextDocumentParser,
|
||||||
sample_txt_file: Path,
|
sample_txt_file: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
text_parser.configure(ParserContext())
|
||||||
text_parser.parse(sample_txt_file, "text/plain")
|
text_parser.parse(sample_txt_file, "text/plain")
|
||||||
|
|
||||||
assert text_parser.get_text() == "This is a test file.\n"
|
assert text_parser.get_text() == "This is a test file.\n"
|
||||||
@@ -102,6 +104,7 @@ class TestTextParserParse:
|
|||||||
text_parser: TextDocumentParser,
|
text_parser: TextDocumentParser,
|
||||||
sample_txt_file: Path,
|
sample_txt_file: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
text_parser.configure(ParserContext())
|
||||||
text_parser.parse(sample_txt_file, "text/plain")
|
text_parser.parse(sample_txt_file, "text/plain")
|
||||||
|
|
||||||
assert text_parser.get_archive_path() is None
|
assert text_parser.get_archive_path() is None
|
||||||
@@ -111,6 +114,7 @@ class TestTextParserParse:
|
|||||||
text_parser: TextDocumentParser,
|
text_parser: TextDocumentParser,
|
||||||
sample_txt_file: Path,
|
sample_txt_file: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
text_parser.configure(ParserContext())
|
||||||
text_parser.parse(sample_txt_file, "text/plain")
|
text_parser.parse(sample_txt_file, "text/plain")
|
||||||
|
|
||||||
assert text_parser.get_date() is None
|
assert text_parser.get_date() is None
|
||||||
@@ -129,6 +133,7 @@ class TestTextParserParse:
|
|||||||
- Parsing succeeds
|
- Parsing succeeds
|
||||||
- Invalid bytes are replaced with the Unicode replacement character
|
- Invalid bytes are replaced with the Unicode replacement character
|
||||||
"""
|
"""
|
||||||
|
text_parser.configure(ParserContext())
|
||||||
text_parser.parse(malformed_txt_file, "text/plain")
|
text_parser.parse(malformed_txt_file, "text/plain")
|
||||||
|
|
||||||
assert text_parser.get_text() == "Pantothens\ufffdure\n"
|
assert text_parser.get_text() == "Pantothens\ufffdure\n"
|
||||||
@@ -251,6 +256,9 @@ class TestTextParserRegistry:
|
|||||||
from paperless.parsers.registry import get_parser_registry
|
from paperless.parsers.registry import get_parser_registry
|
||||||
|
|
||||||
registry = get_parser_registry()
|
registry = get_parser_registry()
|
||||||
parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
|
parser_cls = registry.get_parser_for_file(
|
||||||
|
"application/x-unknown-format",
|
||||||
|
"doc.xyz",
|
||||||
|
)
|
||||||
|
|
||||||
assert parser_cls is None
|
assert parser_cls is None
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ from pathlib import Path
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from documents.tests.utils import util_call_with_backoff
|
from documents.tests.utils import util_call_with_backoff
|
||||||
from paperless_tika.parsers import TikaDocumentParser
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
@@ -42,14 +42,15 @@ class TestTikaParserAgainstServer:
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
tika_parser.text
|
tika_parser.get_text()
|
||||||
== "This is an ODT test document, created September 14, 2022"
|
== "This is an ODT test document, created September 14, 2022"
|
||||||
)
|
)
|
||||||
assert tika_parser.archive_path is not None
|
archive = tika_parser.get_archive_path()
|
||||||
assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
|
assert archive is not None
|
||||||
|
assert b"PDF-" in archive.read_bytes()[:10]
|
||||||
|
|
||||||
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible
|
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible
|
||||||
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
|
# self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
|
||||||
|
|
||||||
def test_basic_parse_docx(
|
def test_basic_parse_docx(
|
||||||
self,
|
self,
|
||||||
@@ -74,14 +75,15 @@ class TestTikaParserAgainstServer:
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
tika_parser.text
|
tika_parser.get_text()
|
||||||
== "This is an DOCX test document, also made September 14, 2022"
|
== "This is an DOCX test document, also made September 14, 2022"
|
||||||
)
|
)
|
||||||
assert tika_parser.archive_path is not None
|
archive = tika_parser.get_archive_path()
|
||||||
with Path(tika_parser.archive_path).open("rb") as f:
|
assert archive is not None
|
||||||
|
with archive.open("rb") as f:
|
||||||
assert b"PDF-" in f.read()[:10]
|
assert b"PDF-" in f.read()[:10]
|
||||||
|
|
||||||
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
|
# self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
|
||||||
|
|
||||||
def test_basic_parse_doc(
|
def test_basic_parse_doc(
|
||||||
self,
|
self,
|
||||||
@@ -102,13 +104,12 @@ class TestTikaParserAgainstServer:
|
|||||||
[sample_doc_file, "application/msword"],
|
[sample_doc_file, "application/msword"],
|
||||||
)
|
)
|
||||||
|
|
||||||
assert tika_parser.text is not None
|
text = tika_parser.get_text()
|
||||||
assert (
|
assert text is not None
|
||||||
"This is a test document, saved in the older .doc format"
|
assert "This is a test document, saved in the older .doc format" in text
|
||||||
in tika_parser.text
|
archive = tika_parser.get_archive_path()
|
||||||
)
|
assert archive is not None
|
||||||
assert tika_parser.archive_path is not None
|
with archive.open("rb") as f:
|
||||||
with Path(tika_parser.archive_path).open("rb") as f:
|
|
||||||
assert b"PDF-" in f.read()[:10]
|
assert b"PDF-" in f.read()[:10]
|
||||||
|
|
||||||
def test_tika_fails_multi_part(
|
def test_tika_fails_multi_part(
|
||||||
@@ -133,6 +134,7 @@ class TestTikaParserAgainstServer:
|
|||||||
[sample_broken_odt, "application/vnd.oasis.opendocument.text"],
|
[sample_broken_odt, "application/vnd.oasis.opendocument.text"],
|
||||||
)
|
)
|
||||||
|
|
||||||
assert tika_parser.archive_path is not None
|
archive = tika_parser.get_archive_path()
|
||||||
with Path(tika_parser.archive_path).open("rb") as f:
|
assert archive is not None
|
||||||
|
with archive.open("rb") as f:
|
||||||
assert b"PDF-" in f.read()[:10]
|
assert b"PDF-" in f.read()[:10]
|
||||||
@@ -9,7 +9,80 @@ from pytest_django.fixtures import SettingsWrapper
|
|||||||
from pytest_httpx import HTTPXMock
|
from pytest_httpx import HTTPXMock
|
||||||
|
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from paperless_tika.parsers import TikaDocumentParser
|
from paperless.parsers import ParserContext
|
||||||
|
from paperless.parsers import ParserProtocol
|
||||||
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
|
|
||||||
|
class TestTikaParserRegistryInterface:
|
||||||
|
"""Verify that TikaDocumentParser satisfies the ParserProtocol contract."""
|
||||||
|
|
||||||
|
def test_satisfies_parser_protocol(self) -> None:
|
||||||
|
assert isinstance(TikaDocumentParser(), ParserProtocol)
|
||||||
|
|
||||||
|
def test_supported_mime_types_is_classmethod(self) -> None:
|
||||||
|
mime_types = TikaDocumentParser.supported_mime_types()
|
||||||
|
assert isinstance(mime_types, dict)
|
||||||
|
assert len(mime_types) > 0
|
||||||
|
|
||||||
|
def test_score_returns_none_when_tika_disabled(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
) -> None:
|
||||||
|
settings.TIKA_ENABLED = False
|
||||||
|
result = TikaDocumentParser.score(
|
||||||
|
"application/vnd.oasis.opendocument.text",
|
||||||
|
"sample.odt",
|
||||||
|
)
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_score_returns_int_when_tika_enabled(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
) -> None:
|
||||||
|
settings.TIKA_ENABLED = True
|
||||||
|
result = TikaDocumentParser.score(
|
||||||
|
"application/vnd.oasis.opendocument.text",
|
||||||
|
"sample.odt",
|
||||||
|
)
|
||||||
|
assert isinstance(result, int)
|
||||||
|
|
||||||
|
def test_score_returns_none_for_unsupported_mime(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
) -> None:
|
||||||
|
settings.TIKA_ENABLED = True
|
||||||
|
result = TikaDocumentParser.score("application/pdf", "doc.pdf")
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_can_produce_archive_is_false(self) -> None:
|
||||||
|
assert TikaDocumentParser().can_produce_archive is False
|
||||||
|
|
||||||
|
def test_requires_pdf_rendition_is_true(self) -> None:
|
||||||
|
assert TikaDocumentParser().requires_pdf_rendition is True
|
||||||
|
|
||||||
|
def test_get_page_count_returns_none_without_archive(
|
||||||
|
self,
|
||||||
|
tika_parser: TikaDocumentParser,
|
||||||
|
sample_odt_file: Path,
|
||||||
|
) -> None:
|
||||||
|
assert (
|
||||||
|
tika_parser.get_page_count(
|
||||||
|
sample_odt_file,
|
||||||
|
"application/vnd.oasis.opendocument.text",
|
||||||
|
)
|
||||||
|
is None
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_get_page_count_returns_int_with_pdf_archive(
|
||||||
|
self,
|
||||||
|
tika_parser: TikaDocumentParser,
|
||||||
|
simple_digital_pdf_file: Path,
|
||||||
|
) -> None:
|
||||||
|
tika_parser._archive_path = simple_digital_pdf_file
|
||||||
|
count = tika_parser.get_page_count(simple_digital_pdf_file, "application/pdf")
|
||||||
|
assert isinstance(count, int)
|
||||||
|
assert count > 0
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db()
|
@pytest.mark.django_db()
|
||||||
@@ -34,14 +107,15 @@ class TestTikaParser:
|
|||||||
# Pretend convert to PDF response
|
# Pretend convert to PDF response
|
||||||
httpx_mock.add_response(content=b"PDF document")
|
httpx_mock.add_response(content=b"PDF document")
|
||||||
|
|
||||||
|
tika_parser.configure(ParserContext())
|
||||||
tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
|
tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
|
||||||
|
|
||||||
assert tika_parser.text == "the content"
|
assert tika_parser.get_text() == "the content"
|
||||||
assert tika_parser.archive_path is not None
|
assert tika_parser.get_archive_path() is not None
|
||||||
with Path(tika_parser.archive_path).open("rb") as f:
|
with Path(tika_parser.get_archive_path()).open("rb") as f:
|
||||||
assert f.read() == b"PDF document"
|
assert f.read() == b"PDF document"
|
||||||
|
|
||||||
assert tika_parser.date == datetime.datetime(
|
assert tika_parser.get_date() == datetime.datetime(
|
||||||
2020,
|
2020,
|
||||||
11,
|
11,
|
||||||
21,
|
21,
|
||||||
@@ -89,7 +163,7 @@ class TestTikaParser:
|
|||||||
httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
|
httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
|
||||||
|
|
||||||
with pytest.raises(ParseError):
|
with pytest.raises(ParseError):
|
||||||
tika_parser.convert_to_pdf(sample_odt_file, None)
|
tika_parser._convert_to_pdf(sample_odt_file)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("setting_value", "expected_form_value"),
|
("setting_value", "expected_form_value"),
|
||||||
@@ -106,7 +180,6 @@ class TestTikaParser:
|
|||||||
expected_form_value: str,
|
expected_form_value: str,
|
||||||
httpx_mock: HTTPXMock,
|
httpx_mock: HTTPXMock,
|
||||||
settings: SettingsWrapper,
|
settings: SettingsWrapper,
|
||||||
tika_parser: TikaDocumentParser,
|
|
||||||
sample_odt_file: Path,
|
sample_odt_file: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
@@ -117,6 +190,8 @@ class TestTikaParser:
|
|||||||
THEN:
|
THEN:
|
||||||
- Request to Gotenberg contains the expected PDF/A format string
|
- Request to Gotenberg contains the expected PDF/A format string
|
||||||
"""
|
"""
|
||||||
|
# Parser must be created after the setting is changed so that
|
||||||
|
# OutputTypeConfig reads the correct value at __init__ time.
|
||||||
settings.OCR_OUTPUT_TYPE = setting_value
|
settings.OCR_OUTPUT_TYPE = setting_value
|
||||||
httpx_mock.add_response(
|
httpx_mock.add_response(
|
||||||
status_code=codes.OK,
|
status_code=codes.OK,
|
||||||
@@ -124,7 +199,8 @@ class TestTikaParser:
|
|||||||
method="POST",
|
method="POST",
|
||||||
)
|
)
|
||||||
|
|
||||||
tika_parser.convert_to_pdf(sample_odt_file, None)
|
with TikaDocumentParser() as parser:
|
||||||
|
parser._convert_to_pdf(sample_odt_file)
|
||||||
|
|
||||||
request = httpx_mock.get_request()
|
request = httpx_mock.get_request()
|
||||||
|
|
||||||
|
Before Width: | Height: | Size: 6.0 KiB After Width: | Height: | Size: 6.0 KiB |
|
Before Width: | Height: | Size: 2.8 KiB After Width: | Height: | Size: 2.8 KiB |
|
Before Width: | Height: | Size: 6.9 KiB After Width: | Height: | Size: 6.9 KiB |
|
Before Width: | Height: | Size: 5.2 KiB After Width: | Height: | Size: 5.2 KiB |
|
Before Width: | Height: | Size: 5.7 KiB After Width: | Height: | Size: 5.7 KiB |