mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-29 20:32:44 +00:00
Compare commits
4 Commits
feature-ar
...
chore/plug
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3d0d243057 | ||
|
|
7a192d021f | ||
|
|
1e30490a46 | ||
|
|
bd9e529a63 |
3
.github/ISSUE_TEMPLATE/bug-report.yml
vendored
3
.github/ISSUE_TEMPLATE/bug-report.yml
vendored
@@ -21,6 +21,7 @@ body:
|
|||||||
- [The installation instructions](https://docs.paperless-ngx.com/setup/#installation).
|
- [The installation instructions](https://docs.paperless-ngx.com/setup/#installation).
|
||||||
- [Existing issues and discussions](https://github.com/paperless-ngx/paperless-ngx/search?q=&type=issues).
|
- [Existing issues and discussions](https://github.com/paperless-ngx/paperless-ngx/search?q=&type=issues).
|
||||||
- Disable any custom container initialization scripts, if using
|
- Disable any custom container initialization scripts, if using
|
||||||
|
- Remove any third-party parser plugins — issues caused by or requiring changes to a third-party plugin will be closed without investigation.
|
||||||
|
|
||||||
If you encounter issues while installing or configuring Paperless-ngx, please post in the ["Support" section of the discussions](https://github.com/paperless-ngx/paperless-ngx/discussions/new?category=support).
|
If you encounter issues while installing or configuring Paperless-ngx, please post in the ["Support" section of the discussions](https://github.com/paperless-ngx/paperless-ngx/discussions/new?category=support).
|
||||||
- type: textarea
|
- type: textarea
|
||||||
@@ -120,5 +121,7 @@ body:
|
|||||||
required: true
|
required: true
|
||||||
- label: I have already searched for relevant existing issues and discussions before opening this report.
|
- label: I have already searched for relevant existing issues and discussions before opening this report.
|
||||||
required: true
|
required: true
|
||||||
|
- label: I have reproduced this issue with all third-party parser plugins removed. I understand that issues caused by third-party plugins will be closed without investigation.
|
||||||
|
required: true
|
||||||
- label: I have updated the title field above with a concise description.
|
- label: I have updated the title field above with a concise description.
|
||||||
required: true
|
required: true
|
||||||
|
|||||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -111,4 +111,3 @@ celerybeat-schedule*
|
|||||||
|
|
||||||
# ignore pnpm package store folder created when setting up the devcontainer
|
# ignore pnpm package store folder created when setting up the devcontainer
|
||||||
.pnpm-store/
|
.pnpm-store/
|
||||||
.worktrees
|
|
||||||
|
|||||||
@@ -723,6 +723,81 @@ services:
|
|||||||
|
|
||||||
1. Note the `:ro` tag means the folder will be mounted as read only. This is for extra security against changes
|
1. Note the `:ro` tag means the folder will be mounted as read only. This is for extra security against changes
|
||||||
|
|
||||||
|
## Installing third-party parser plugins {#parser-plugins}
|
||||||
|
|
||||||
|
Third-party parser plugins extend Paperless-ngx to support additional file
|
||||||
|
formats. A plugin is a Python package that advertises itself under the
|
||||||
|
`paperless_ngx.parsers` entry point group. Refer to the
|
||||||
|
[developer documentation](development.md#making-custom-parsers) for how to
|
||||||
|
create one.
|
||||||
|
|
||||||
|
!!! warning "Third-party plugins are not officially supported"
|
||||||
|
|
||||||
|
The Paperless-ngx maintainers do not provide support for third-party
|
||||||
|
plugins. Issues caused by or requiring changes to a third-party plugin
|
||||||
|
will be closed without further investigation. Always reproduce problems
|
||||||
|
with all plugins removed before filing a bug report.
|
||||||
|
|
||||||
|
### Docker
|
||||||
|
|
||||||
|
Use a [custom container initialization script](#custom-container-initialization)
|
||||||
|
to install the package before the webserver starts. Create a shell script and
|
||||||
|
mount it into `/custom-cont-init.d`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
# /path/to/my/scripts/install-parsers.sh
|
||||||
|
|
||||||
|
pip install my-paperless-parser-package
|
||||||
|
```
|
||||||
|
|
||||||
|
Mount it in your `docker-compose.yml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
webserver:
|
||||||
|
# ...
|
||||||
|
volumes:
|
||||||
|
- /path/to/my/scripts:/custom-cont-init.d:ro
|
||||||
|
```
|
||||||
|
|
||||||
|
The script runs as `root` before the webserver starts, so the package will be
|
||||||
|
available when Paperless-ngx discovers plugins at startup.
|
||||||
|
|
||||||
|
### Bare metal
|
||||||
|
|
||||||
|
Install the package into the same Python environment that runs Paperless-ngx.
|
||||||
|
If you followed the standard bare-metal install guide, that is the `paperless`
|
||||||
|
user's environment:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo -Hu paperless pip3 install my-paperless-parser-package
|
||||||
|
```
|
||||||
|
|
||||||
|
If you are using `uv` or a virtual environment, activate it first and then run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv pip install my-paperless-parser-package
|
||||||
|
# or
|
||||||
|
pip install my-paperless-parser-package
|
||||||
|
```
|
||||||
|
|
||||||
|
Restart all Paperless-ngx services after installation so the new plugin is
|
||||||
|
discovered.
|
||||||
|
|
||||||
|
### Verifying installation
|
||||||
|
|
||||||
|
On the next startup, check the application logs for a line confirming
|
||||||
|
discovery:
|
||||||
|
|
||||||
|
```
|
||||||
|
Loaded third-party parser 'My Parser' v1.0.0 by Acme Corp (entrypoint: 'my_parser').
|
||||||
|
```
|
||||||
|
|
||||||
|
If this line does not appear, verify that the package is installed in the
|
||||||
|
correct environment and that its `pyproject.toml` declares the
|
||||||
|
`paperless_ngx.parsers` entry point.
|
||||||
|
|
||||||
## MySQL Caveats {#mysql-caveats}
|
## MySQL Caveats {#mysql-caveats}
|
||||||
|
|
||||||
### Case Sensitivity
|
### Case Sensitivity
|
||||||
|
|||||||
@@ -801,14 +801,11 @@ parsing documents.
|
|||||||
|
|
||||||
#### [`PAPERLESS_OCR_MODE=<mode>`](#PAPERLESS_OCR_MODE) {#PAPERLESS_OCR_MODE}
|
#### [`PAPERLESS_OCR_MODE=<mode>`](#PAPERLESS_OCR_MODE) {#PAPERLESS_OCR_MODE}
|
||||||
|
|
||||||
: Tell paperless when and how to perform ocr on your documents. Four
|
: Tell paperless when and how to perform ocr on your documents. Three
|
||||||
modes are available:
|
modes are available:
|
||||||
|
|
||||||
- `auto` (default): Paperless detects whether a document already
|
- `skip`: Paperless skips all pages and will perform ocr only on
|
||||||
has embedded text via pdftotext. If sufficient text is found,
|
pages where no text is present. This is the safest option.
|
||||||
OCR is skipped for that document (`--skip-text`). If no text is
|
|
||||||
present, OCR runs normally. This is the safest option for mixed
|
|
||||||
document collections.
|
|
||||||
|
|
||||||
- `redo`: Paperless will OCR all pages of your documents and
|
- `redo`: Paperless will OCR all pages of your documents and
|
||||||
attempt to replace any existing text layers with new text. This
|
attempt to replace any existing text layers with new text. This
|
||||||
@@ -826,59 +823,24 @@ modes are available:
|
|||||||
significantly larger and text won't appear as sharp when zoomed
|
significantly larger and text won't appear as sharp when zoomed
|
||||||
in.
|
in.
|
||||||
|
|
||||||
- `off`: Paperless never invokes the OCR engine. For PDFs, text
|
The default is `skip`, which only performs OCR when necessary and
|
||||||
is extracted via pdftotext only. For image documents, text will
|
always creates archived documents.
|
||||||
be empty. Archive file generation still works via format
|
|
||||||
conversion (no Tesseract or Ghostscript required).
|
|
||||||
|
|
||||||
The default is `auto`.
|
Read more about this in the [OCRmyPDF
|
||||||
|
|
||||||
For the `skip`, `redo`, and `force` modes, read more about OCR
|
|
||||||
behaviour in the [OCRmyPDF
|
|
||||||
documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
|
documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
|
||||||
|
|
||||||
#### [`PAPERLESS_ARCHIVE_FILE_GENERATION=<mode>`](#PAPERLESS_ARCHIVE_FILE_GENERATION) {#PAPERLESS_ARCHIVE_FILE_GENERATION}
|
#### [`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=<mode>`](#PAPERLESS_OCR_SKIP_ARCHIVE_FILE) {#PAPERLESS_OCR_SKIP_ARCHIVE_FILE}
|
||||||
|
|
||||||
: Controls when paperless creates a PDF/A archive version of your
|
: Specify when you would like paperless to skip creating an archived
|
||||||
documents. Archive files are stored alongside the original and are used
|
version of your documents. This is useful if you don't want to have two
|
||||||
for display in the web interface.
|
almost-identical versions of your documents in the media folder.
|
||||||
|
|
||||||
- `auto` (default): Produce archives for scanned or image-based
|
- `never`: Never skip creating an archived version.
|
||||||
documents. Skip archive generation for born-digital PDFs that
|
- `with_text`: Skip creating an archived version for documents
|
||||||
already contain embedded text. This is the recommended setting
|
that already have embedded text.
|
||||||
for mixed document collections.
|
- `always`: Always skip creating an archived version.
|
||||||
- `always`: Always produce a PDF/A archive when the parser
|
|
||||||
supports it, regardless of whether the document already has
|
|
||||||
text.
|
|
||||||
- `never`: Never produce an archive. Only the original file is
|
|
||||||
stored. Saves disk space but the web viewer will display the
|
|
||||||
original file directly.
|
|
||||||
|
|
||||||
**Behaviour by file type and mode** (`auto` column shows the default):
|
The default is `never`.
|
||||||
|
|
||||||
| Document type | `never` | `auto` (default) | `always` |
|
|
||||||
| -------------------------- | ------- | -------------------------- | -------- |
|
|
||||||
| Scanned image (TIFF, JPEG) | No | **Yes** | Yes |
|
|
||||||
| Image-based PDF | No | **Yes** (short/no text, untagged) | Yes |
|
|
||||||
| Born-digital PDF | No | No (tagged or has embedded text) | Yes |
|
|
||||||
| Plain text, email, HTML | No | No | No |
|
|
||||||
| DOCX / ODT (via Tika) | Yes\* | Yes\* | Yes\* |
|
|
||||||
|
|
||||||
\* Tika always produces a PDF rendition for display; this counts as
|
|
||||||
the archive regardless of the setting.
|
|
||||||
|
|
||||||
!!! note
|
|
||||||
|
|
||||||
This setting applies to the built-in Tesseract parser. Parsers
|
|
||||||
that must always convert documents to PDF for display (e.g. DOCX,
|
|
||||||
ODT via Tika) will produce a PDF regardless of this setting.
|
|
||||||
|
|
||||||
!!! note
|
|
||||||
|
|
||||||
The **remote OCR parser** (Azure AI) always produces a searchable
|
|
||||||
PDF and stores it as the archive copy, regardless of this setting.
|
|
||||||
`ARCHIVE_FILE_GENERATION=never` has no effect when the remote
|
|
||||||
parser handles a document.
|
|
||||||
|
|
||||||
#### [`PAPERLESS_OCR_CLEAN=<mode>`](#PAPERLESS_OCR_CLEAN) {#PAPERLESS_OCR_CLEAN}
|
#### [`PAPERLESS_OCR_CLEAN=<mode>`](#PAPERLESS_OCR_CLEAN) {#PAPERLESS_OCR_CLEAN}
|
||||||
|
|
||||||
|
|||||||
@@ -370,121 +370,363 @@ docker build --file Dockerfile --tag paperless:local .
|
|||||||
|
|
||||||
## Extending Paperless-ngx
|
## Extending Paperless-ngx
|
||||||
|
|
||||||
Paperless-ngx does not have any fancy plugin systems and will probably never
|
Paperless-ngx supports third-party document parsers via a Python entry point
|
||||||
have. However, some parts of the application have been designed to allow
|
plugin system. Plugins are distributed as ordinary Python packages and
|
||||||
easy integration of additional features without any modification to the
|
discovered automatically at startup — no changes to the Paperless-ngx source
|
||||||
base code.
|
are required.
|
||||||
|
|
||||||
|
!!! warning "Third-party plugins are not officially supported"
|
||||||
|
|
||||||
|
The Paperless-ngx maintainers do not provide support for third-party
|
||||||
|
plugins. Issues that are caused by or require changes to a third-party
|
||||||
|
plugin will be closed without further investigation. If you believe you
|
||||||
|
have found a bug in Paperless-ngx itself (not in a plugin), please
|
||||||
|
reproduce it with all third-party plugins removed before filing an issue.
|
||||||
|
|
||||||
### Making custom parsers
|
### Making custom parsers
|
||||||
|
|
||||||
Paperless-ngx uses parsers to add documents. A parser is
|
Paperless-ngx uses parsers to add documents. A parser is responsible for:
|
||||||
responsible for:
|
|
||||||
|
|
||||||
- Retrieving the content from the original
|
- Extracting plain-text content from the document
|
||||||
- Creating a thumbnail
|
- Generating a thumbnail image
|
||||||
- _optional:_ Retrieving a created date from the original
|
- _optional:_ Detecting the document's creation date
|
||||||
- _optional:_ Creating an archived document from the original
|
- _optional:_ Producing a searchable PDF archive copy
|
||||||
|
|
||||||
Custom parsers can be added to Paperless-ngx to support more file types. In
|
Custom parsers are distributed as ordinary Python packages and registered
|
||||||
order to do that, you need to write the parser itself and announce its
|
via a [setuptools entry point](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
|
||||||
existence to Paperless-ngx.
|
No changes to the Paperless-ngx source are required.
|
||||||
|
|
||||||
The parser itself must extend `documents.parsers.DocumentParser` and
|
#### 1. Implementing the parser class
|
||||||
must implement the methods `parse` and `get_thumbnail`. You can provide
|
|
||||||
your own implementation to `get_date` if you don't want to rely on
|
Your parser must satisfy the `ParserProtocol` structural interface defined in
|
||||||
Paperless-ngx' default date guessing mechanisms.
|
`paperless.parsers`. The simplest approach is to write a plain class — no base
|
||||||
|
class is required, only the right attributes and methods.
|
||||||
|
|
||||||
|
**Class-level identity attributes**
|
||||||
|
|
||||||
|
The registry reads these before instantiating the parser, so they must be
|
||||||
|
plain class attributes (not instance attributes or properties):
|
||||||
|
|
||||||
```python
|
```python
|
||||||
class MyCustomParser(DocumentParser):
|
class MyCustomParser:
|
||||||
|
name = "My Format Parser" # human-readable name shown in logs
|
||||||
def parse(self, document_path, mime_type):
|
version = "1.0.0" # semantic version string
|
||||||
# This method does not return anything. Rather, you should assign
|
author = "Acme Corp" # author / organisation
|
||||||
# whatever you got from the document to the following fields:
|
url = "https://example.com/my-parser" # docs or issue tracker
|
||||||
|
|
||||||
# The content of the document.
|
|
||||||
self.text = "content"
|
|
||||||
|
|
||||||
# Optional: path to a PDF document that you created from the original.
|
|
||||||
self.archive_path = os.path.join(self.tempdir, "archived.pdf")
|
|
||||||
|
|
||||||
# Optional: "created" date of the document.
|
|
||||||
self.date = get_created_from_metadata(document_path)
|
|
||||||
|
|
||||||
def get_thumbnail(self, document_path, mime_type):
|
|
||||||
# This should return the path to a thumbnail you created for this
|
|
||||||
# document.
|
|
||||||
return os.path.join(self.tempdir, "thumb.webp")
|
|
||||||
```
|
```
|
||||||
|
|
||||||
If you encounter any issues during parsing, raise a
|
**Declaring supported MIME types**
|
||||||
`documents.parsers.ParseError`.
|
|
||||||
|
|
||||||
The `self.tempdir` directory is a temporary directory that is guaranteed
|
Return a `dict` mapping MIME type strings to preferred file extensions
|
||||||
to be empty and removed after consumption finished. You can use that
|
(including the leading dot). Paperless-ngx uses the extension when storing
|
||||||
directory to store any intermediate files and also use it to store the
|
archive copies and serving files for download.
|
||||||
thumbnail / archived document.
|
|
||||||
|
|
||||||
After that, you need to announce your parser to Paperless-ngx. You need to
|
|
||||||
connect a handler to the `document_consumer_declaration` signal. Have a
|
|
||||||
look in the file `src/paperless_tesseract/apps.py` on how that's done.
|
|
||||||
The handler is a method that returns information about your parser:
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def myparser_consumer_declaration(sender, **kwargs):
|
@classmethod
|
||||||
|
def supported_mime_types(cls) -> dict[str, str]:
|
||||||
return {
|
return {
|
||||||
"parser": MyCustomParser,
|
"application/x-my-format": ".myf",
|
||||||
"weight": 0,
|
"application/x-my-format-alt": ".myf",
|
||||||
"mime_types": {
|
|
||||||
"application/pdf": ".pdf",
|
|
||||||
"image/jpeg": ".jpg",
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
- `parser` is a reference to a class that extends `DocumentParser`.
|
**Scoring**
|
||||||
- `weight` is used whenever two or more parsers are able to parse a
|
|
||||||
file: The parser with the higher weight wins. This can be used to
|
|
||||||
override the parsers provided by Paperless-ngx.
|
|
||||||
- `mime_types` is a dictionary. The keys are the mime types your
|
|
||||||
parser supports and the value is the default file extension that
|
|
||||||
Paperless-ngx should use when storing files and serving them for
|
|
||||||
download. We could guess that from the file extensions, but some
|
|
||||||
mime types have many extensions associated with them and the Python
|
|
||||||
methods responsible for guessing the extension do not always return
|
|
||||||
the same value.
|
|
||||||
|
|
||||||
## Using Visual Studio Code devcontainer
|
When more than one parser can handle a file, the registry calls `score()` on
|
||||||
|
each candidate and picks the one with the highest result. Return `None` to
|
||||||
|
decline handling a file even though the MIME type is listed as supported (for
|
||||||
|
example, when a required external service is not configured).
|
||||||
|
|
||||||
Another easy way to get started with development is to use Visual Studio
|
| Score | Meaning |
|
||||||
Code devcontainers. This approach will create a preconfigured development
|
| ------ | ------------------------------------------------- |
|
||||||
environment with all of the required tools and dependencies.
|
| `None` | Decline — do not handle this file |
|
||||||
[Learn more about devcontainers](https://code.visualstudio.com/docs/devcontainers/containers).
|
| `10` | Default priority used by all built-in parsers |
|
||||||
The .devcontainer/vscode/tasks.json and .devcontainer/vscode/launch.json files
|
| `> 10` | Override a built-in parser for the same MIME type |
|
||||||
contain more information about the specific tasks and launch configurations (see the
|
|
||||||
non-standard "description" field).
|
|
||||||
|
|
||||||
To get started:
|
```python
|
||||||
|
@classmethod
|
||||||
|
def score(
|
||||||
|
cls,
|
||||||
|
mime_type: str,
|
||||||
|
filename: str,
|
||||||
|
path: "Path | None" = None,
|
||||||
|
) -> int | None:
|
||||||
|
# Inspect filename or file bytes here if needed.
|
||||||
|
return 10
|
||||||
|
```
|
||||||
|
|
||||||
1. Clone the repository on your machine and open the Paperless-ngx folder in VS Code.
|
**Archive and rendition flags**
|
||||||
|
|
||||||
2. VS Code will prompt you with "Reopen in container". Do so and wait for the environment to start.
|
```python
|
||||||
|
@property
|
||||||
|
def can_produce_archive(self) -> bool:
|
||||||
|
"""True if parse() can produce a searchable PDF archive copy."""
|
||||||
|
return True # or False if your parser doesn't produce PDFs
|
||||||
|
|
||||||
3. In case your host operating system is Windows:
|
@property
|
||||||
- The Source Control view in Visual Studio Code might show: "The detected Git repository is potentially unsafe as the folder is owned by someone other than the current user." Use "Manage Unsafe Repositories" to fix this.
|
def requires_pdf_rendition(self) -> bool:
|
||||||
- Git might have detected modifications for all files, because Windows is using CRLF line endings. Run `git checkout .` in the containers terminal to fix this issue.
|
"""True if the original format cannot be displayed by a browser
|
||||||
|
(e.g. DOCX, ODT) and the PDF output must always be kept."""
|
||||||
|
return False
|
||||||
|
```
|
||||||
|
|
||||||
4. Initialize the project by running the task **Project Setup: Run all Init Tasks**. This
|
**Context manager — temp directory lifecycle**
|
||||||
will initialize the database tables and create a superuser. Then you can compile the front end
|
|
||||||
for production or run the frontend in debug mode.
|
|
||||||
|
|
||||||
5. The project is ready for debugging: start either the fullstack debug or individual debug
|
Paperless-ngx always uses parsers as context managers. Create a temporary
|
||||||
processes. To spin up the project without debugging, run the task **Project Start: Run all Services**
|
working directory in `__enter__` (or `__init__`) and remove it in `__exit__`
|
||||||
|
regardless of whether an exception occurred. Store intermediate files,
|
||||||
|
thumbnails, and archive PDFs inside this directory.
|
||||||
|
|
||||||
## Developing Date Parser Plugins
|
```python
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Self
|
||||||
|
from types import TracebackType
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
class MyCustomParser:
|
||||||
|
...
|
||||||
|
|
||||||
|
def __init__(self, logging_group: object = None) -> None:
|
||||||
|
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
self._tempdir = Path(
|
||||||
|
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
|
||||||
|
)
|
||||||
|
self._text: str | None = None
|
||||||
|
self._archive_path: Path | None = None
|
||||||
|
|
||||||
|
def __enter__(self) -> Self:
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(
|
||||||
|
self,
|
||||||
|
exc_type: type[BaseException] | None,
|
||||||
|
exc_val: BaseException | None,
|
||||||
|
exc_tb: TracebackType | None,
|
||||||
|
) -> None:
|
||||||
|
shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Optional context — `configure()`**
|
||||||
|
|
||||||
|
The consumer calls `configure()` with a `ParserContext` after instantiation
|
||||||
|
and before `parse()`. If your parser doesn't need context, a no-op
|
||||||
|
implementation is fine:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from paperless.parsers import ParserContext
|
||||||
|
|
||||||
|
def configure(self, context: ParserContext) -> None:
|
||||||
|
pass # override if you need context.mailrule_id, etc.
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parsing**
|
||||||
|
|
||||||
|
`parse()` is the core method. It must not return a value; instead, store
|
||||||
|
results in instance attributes and expose them via the accessor methods below.
|
||||||
|
Raise `documents.parsers.ParseError` on any unrecoverable failure.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from documents.parsers import ParseError
|
||||||
|
|
||||||
|
def parse(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
*,
|
||||||
|
produce_archive: bool = True,
|
||||||
|
) -> None:
|
||||||
|
try:
|
||||||
|
self._text = extract_text_from_my_format(document_path)
|
||||||
|
except Exception as e:
|
||||||
|
raise ParseError(f"Failed to parse {document_path}: {e}") from e
|
||||||
|
|
||||||
|
if produce_archive and self.can_produce_archive:
|
||||||
|
archive = self._tempdir / "archived.pdf"
|
||||||
|
convert_to_pdf(document_path, archive)
|
||||||
|
self._archive_path = archive
|
||||||
|
```
|
||||||
|
|
||||||
|
**Result accessors**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def get_text(self) -> str | None:
|
||||||
|
return self._text
|
||||||
|
|
||||||
|
def get_date(self) -> "datetime.datetime | None":
|
||||||
|
# Return a datetime extracted from the document, or None to let
|
||||||
|
# Paperless-ngx use its default date-guessing logic.
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_archive_path(self) -> Path | None:
|
||||||
|
return self._archive_path
|
||||||
|
```
|
||||||
|
|
||||||
|
**Thumbnail**
|
||||||
|
|
||||||
|
`get_thumbnail()` may be called independently of `parse()`. Return the path
|
||||||
|
to a WebP image inside `self._tempdir`. The image should be roughly 500 × 700
|
||||||
|
pixels.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||||
|
thumb = self._tempdir / "thumb.webp"
|
||||||
|
render_thumbnail(document_path, thumb)
|
||||||
|
return thumb
|
||||||
|
```
|
||||||
|
|
||||||
|
**Optional methods**
|
||||||
|
|
||||||
|
These are called by the API on demand, not during the consumption pipeline.
|
||||||
|
Implement them if your format supports the information; otherwise return
|
||||||
|
`None` / `[]`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def get_page_count(self, document_path: Path, mime_type: str) -> int | None:
|
||||||
|
return count_pages(document_path)
|
||||||
|
|
||||||
|
def extract_metadata(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
) -> "list[MetadataEntry]":
|
||||||
|
# Must never raise. Return [] if metadata cannot be read.
|
||||||
|
from paperless.parsers import MetadataEntry
|
||||||
|
return [
|
||||||
|
MetadataEntry(
|
||||||
|
namespace="https://example.com/ns/",
|
||||||
|
prefix="ex",
|
||||||
|
key="Author",
|
||||||
|
value="Alice",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. Registering via entry point
|
||||||
|
|
||||||
|
Add the following to your package's `pyproject.toml`. The key (left of `=`)
|
||||||
|
is an arbitrary name used only in log output; the value is the
|
||||||
|
`module:ClassName` import path.
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[project.entry-points."paperless_ngx.parsers"]
|
||||||
|
my_parser = "my_package.parsers:MyCustomParser"
|
||||||
|
```
|
||||||
|
|
||||||
|
Install your package into the same Python environment as Paperless-ngx (or
|
||||||
|
add it to the Docker image), and the parser will be discovered automatically
|
||||||
|
on the next startup. No configuration changes are needed.
|
||||||
|
|
||||||
|
To verify discovery, check the application logs at startup for a line like:
|
||||||
|
|
||||||
|
```
|
||||||
|
Loaded third-party parser 'My Format Parser' v1.0.0 by Acme Corp (entrypoint: 'my_parser').
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. Utilities
|
||||||
|
|
||||||
|
`paperless.parsers.utils` provides helpers you can import directly:
|
||||||
|
|
||||||
|
| Function | Description |
|
||||||
|
| --------------------------------------- | ---------------------------------------------------------------- |
|
||||||
|
| `read_file_handle_unicode_errors(path)` | Read a file as UTF-8, replacing invalid bytes instead of raising |
|
||||||
|
| `get_page_count_for_pdf(path)` | Count pages in a PDF using pikepdf |
|
||||||
|
| `extract_pdf_metadata(path)` | Extract XMP metadata from a PDF as a `list[MetadataEntry]` |
|
||||||
|
|
||||||
|
#### Minimal example
|
||||||
|
|
||||||
|
A complete, working parser for a hypothetical plain-XML format:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Self
|
||||||
|
from types import TracebackType
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
from documents.parsers import ParseError
|
||||||
|
from paperless.parsers import ParserContext
|
||||||
|
|
||||||
|
|
||||||
|
class XmlDocumentParser:
|
||||||
|
name = "XML Parser"
|
||||||
|
version = "1.0.0"
|
||||||
|
author = "Acme Corp"
|
||||||
|
url = "https://example.com/xml-parser"
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def supported_mime_types(cls) -> dict[str, str]:
|
||||||
|
return {"application/xml": ".xml", "text/xml": ".xml"}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def score(cls, mime_type: str, filename: str, path: Path | None = None) -> int | None:
|
||||||
|
return 10
|
||||||
|
|
||||||
|
@property
|
||||||
|
def can_produce_archive(self) -> bool:
|
||||||
|
return False
|
||||||
|
|
||||||
|
@property
|
||||||
|
def requires_pdf_rendition(self) -> bool:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def __init__(self, logging_group: object = None) -> None:
|
||||||
|
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
self._tempdir = Path(tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR))
|
||||||
|
self._text: str | None = None
|
||||||
|
|
||||||
|
def __enter__(self) -> Self:
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
|
||||||
|
shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||||
|
|
||||||
|
def configure(self, context: ParserContext) -> None:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def parse(self, document_path: Path, mime_type: str, *, produce_archive: bool = True) -> None:
|
||||||
|
try:
|
||||||
|
tree = ET.parse(document_path)
|
||||||
|
self._text = " ".join(tree.getroot().itertext())
|
||||||
|
except ET.ParseError as e:
|
||||||
|
raise ParseError(f"XML parse error: {e}") from e
|
||||||
|
|
||||||
|
def get_text(self) -> str | None:
|
||||||
|
return self._text
|
||||||
|
|
||||||
|
def get_date(self):
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_archive_path(self) -> Path | None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||||
|
from PIL import Image, ImageDraw
|
||||||
|
img = Image.new("RGB", (500, 700), color="white")
|
||||||
|
ImageDraw.Draw(img).text((10, 10), "XML Document", fill="black")
|
||||||
|
out = self._tempdir / "thumb.webp"
|
||||||
|
img.save(out, format="WEBP")
|
||||||
|
return out
|
||||||
|
|
||||||
|
def get_page_count(self, document_path: Path, mime_type: str) -> int | None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def extract_metadata(self, document_path: Path, mime_type: str) -> list:
|
||||||
|
return []
|
||||||
|
```
|
||||||
|
|
||||||
|
### Developing date parser plugins
|
||||||
|
|
||||||
Paperless-ngx uses a plugin system for date parsing, allowing you to extend or replace the default date parsing behavior. Plugins are discovered using [Python entry points](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
|
Paperless-ngx uses a plugin system for date parsing, allowing you to extend or replace the default date parsing behavior. Plugins are discovered using [Python entry points](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
|
||||||
|
|
||||||
### Creating a Date Parser Plugin
|
#### Creating a Date Parser Plugin
|
||||||
|
|
||||||
To create a custom date parser plugin, you need to:
|
To create a custom date parser plugin, you need to:
|
||||||
|
|
||||||
@@ -492,7 +734,7 @@ To create a custom date parser plugin, you need to:
|
|||||||
2. Implement the required abstract method
|
2. Implement the required abstract method
|
||||||
3. Register your plugin via an entry point
|
3. Register your plugin via an entry point
|
||||||
|
|
||||||
#### 1. Implementing the Parser Class
|
##### 1. Implementing the Parser Class
|
||||||
|
|
||||||
Your parser must extend `documents.plugins.date_parsing.DateParserPluginBase` and implement the `parse` method:
|
Your parser must extend `documents.plugins.date_parsing.DateParserPluginBase` and implement the `parse` method:
|
||||||
|
|
||||||
@@ -532,7 +774,7 @@ class MyDateParserPlugin(DateParserPluginBase):
|
|||||||
yield another_datetime
|
yield another_datetime
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 2. Configuration and Helper Methods
|
##### 2. Configuration and Helper Methods
|
||||||
|
|
||||||
Your parser instance is initialized with a `DateParserConfig` object accessible via `self.config`. This provides:
|
Your parser instance is initialized with a `DateParserConfig` object accessible via `self.config`. This provides:
|
||||||
|
|
||||||
@@ -565,11 +807,11 @@ def _filter_date(
|
|||||||
"""
|
"""
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 3. Resource Management (Optional)
|
##### 3. Resource Management (Optional)
|
||||||
|
|
||||||
If your plugin needs to acquire or release resources (database connections, API clients, etc.), override the context manager methods. Paperless-ngx will always use plugins as context managers, ensuring resources can be released even in the event of errors.
|
If your plugin needs to acquire or release resources (database connections, API clients, etc.), override the context manager methods. Paperless-ngx will always use plugins as context managers, ensuring resources can be released even in the event of errors.
|
||||||
|
|
||||||
#### 4. Registering Your Plugin
|
##### 4. Registering Your Plugin
|
||||||
|
|
||||||
Register your plugin using a setuptools entry point in your package's `pyproject.toml`:
|
Register your plugin using a setuptools entry point in your package's `pyproject.toml`:
|
||||||
|
|
||||||
@@ -580,7 +822,7 @@ my_parser = "my_package.parsers:MyDateParserPlugin"
|
|||||||
|
|
||||||
The entry point name (e.g., `"my_parser"`) is used for sorting when multiple plugins are found. Paperless-ngx will use the first plugin alphabetically by name if multiple plugins are discovered.
|
The entry point name (e.g., `"my_parser"`) is used for sorting when multiple plugins are found. Paperless-ngx will use the first plugin alphabetically by name if multiple plugins are discovered.
|
||||||
|
|
||||||
### Plugin Discovery
|
#### Plugin Discovery
|
||||||
|
|
||||||
Paperless-ngx automatically discovers and loads date parser plugins at runtime. The discovery process:
|
Paperless-ngx automatically discovers and loads date parser plugins at runtime. The discovery process:
|
||||||
|
|
||||||
@@ -591,7 +833,7 @@ Paperless-ngx automatically discovers and loads date parser plugins at runtime.
|
|||||||
|
|
||||||
If multiple plugins are installed, a warning is logged indicating which plugin was selected.
|
If multiple plugins are installed, a warning is logged indicating which plugin was selected.
|
||||||
|
|
||||||
### Example: Simple Date Parser
|
#### Example: Simple Date Parser
|
||||||
|
|
||||||
Here's a minimal example that only looks for ISO 8601 dates:
|
Here's a minimal example that only looks for ISO 8601 dates:
|
||||||
|
|
||||||
@@ -623,3 +865,30 @@ class ISODateParserPlugin(DateParserPluginBase):
|
|||||||
if filtered_date is not None:
|
if filtered_date is not None:
|
||||||
yield filtered_date
|
yield filtered_date
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Using Visual Studio Code devcontainer
|
||||||
|
|
||||||
|
Another easy way to get started with development is to use Visual Studio
|
||||||
|
Code devcontainers. This approach will create a preconfigured development
|
||||||
|
environment with all of the required tools and dependencies.
|
||||||
|
[Learn more about devcontainers](https://code.visualstudio.com/docs/devcontainers/containers).
|
||||||
|
The .devcontainer/vscode/tasks.json and .devcontainer/vscode/launch.json files
|
||||||
|
contain more information about the specific tasks and launch configurations (see the
|
||||||
|
non-standard "description" field).
|
||||||
|
|
||||||
|
To get started:
|
||||||
|
|
||||||
|
1. Clone the repository on your machine and open the Paperless-ngx folder in VS Code.
|
||||||
|
|
||||||
|
2. VS Code will prompt you with "Reopen in container". Do so and wait for the environment to start.
|
||||||
|
|
||||||
|
3. In case your host operating system is Windows:
|
||||||
|
- The Source Control view in Visual Studio Code might show: "The detected Git repository is potentially unsafe as the folder is owned by someone other than the current user." Use "Manage Unsafe Repositories" to fix this.
|
||||||
|
- Git might have detecteded modifications for all files, because Windows is using CRLF line endings. Run `git checkout .` in the containers terminal to fix this issue.
|
||||||
|
|
||||||
|
4. Initialize the project by running the task **Project Setup: Run all Init Tasks**. This
|
||||||
|
will initialize the database tables and create a superuser. Then you can compile the front end
|
||||||
|
for production or run the frontend in debug mode.
|
||||||
|
|
||||||
|
5. The project is ready for debugging, start either run the fullstack debug or individual debug
|
||||||
|
processes. Yo spin up the project without debugging run the task **Project Start: Run all Services**
|
||||||
|
|||||||
@@ -104,63 +104,6 @@ Multiple options are combined in a single value:
|
|||||||
PAPERLESS_DB_OPTIONS="sslmode=require;sslrootcert=/certs/ca.pem;pool.max_size=10"
|
PAPERLESS_DB_OPTIONS="sslmode=require;sslrootcert=/certs/ca.pem;pool.max_size=10"
|
||||||
```
|
```
|
||||||
|
|
||||||
## OCR and Archive File Generation Settings
|
|
||||||
|
|
||||||
The settings that control OCR behaviour and archive file generation have been redesigned. The old settings that coupled these two concerns together are **removed** — old values are not silently honoured; a startup warning is logged if any removed variable is still set in your environment.
|
|
||||||
|
|
||||||
### Removed settings
|
|
||||||
|
|
||||||
| Removed Setting | Replacement |
|
|
||||||
| ------------------------------------------- | --------------------------------------------------------------------- |
|
|
||||||
| `PAPERLESS_OCR_MODE=skip` | `PAPERLESS_OCR_MODE=auto` (new default) |
|
|
||||||
| `PAPERLESS_OCR_MODE=skip_noarchive` | `PAPERLESS_OCR_MODE=auto` + `PAPERLESS_ARCHIVE_FILE_GENERATION=never` |
|
|
||||||
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never` | `PAPERLESS_ARCHIVE_FILE_GENERATION=always` |
|
|
||||||
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text` | `PAPERLESS_ARCHIVE_FILE_GENERATION=auto` (new default) |
|
|
||||||
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always` | `PAPERLESS_ARCHIVE_FILE_GENERATION=never` |
|
|
||||||
|
|
||||||
### What changed and why
|
|
||||||
|
|
||||||
Previously, `OCR_MODE` conflated two independent concerns: whether to run OCR and whether to produce an archive. `skip` meant "skip OCR if text exists, but always produce an archive". `skip_noarchive` meant "skip OCR if text exists, and also skip the archive". This made it impossible to, for example, disable OCR entirely while still producing archives.
|
|
||||||
|
|
||||||
The new settings are independent:
|
|
||||||
|
|
||||||
- [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) controls OCR: `auto` (default), `force`, `redo`, `off`.
|
|
||||||
- [`PAPERLESS_ARCHIVE_FILE_GENERATION`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) controls archive production: `auto` (default), `always`, `never`.
|
|
||||||
|
|
||||||
### Action Required
|
|
||||||
|
|
||||||
Remove any `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` variable from your environment. If you relied on `OCR_MODE=skip` or `OCR_MODE=skip_noarchive`, update accordingly:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# v2: skip OCR when text present, always archive
|
|
||||||
PAPERLESS_OCR_MODE=skip
|
|
||||||
# v3: equivalent (auto is the new default)
|
|
||||||
# No change needed — auto is the default
|
|
||||||
|
|
||||||
# v2: skip OCR when text present, skip archive too
|
|
||||||
PAPERLESS_OCR_MODE=skip_noarchive
|
|
||||||
# v3: equivalent
|
|
||||||
PAPERLESS_OCR_MODE=auto
|
|
||||||
PAPERLESS_ARCHIVE_FILE_GENERATION=never
|
|
||||||
|
|
||||||
# v2: always skip archive
|
|
||||||
PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always
|
|
||||||
# v3: equivalent
|
|
||||||
PAPERLESS_ARCHIVE_FILE_GENERATION=never
|
|
||||||
|
|
||||||
# v2: skip archive only for born-digital docs
|
|
||||||
PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text
|
|
||||||
# v3: equivalent (auto is the new default)
|
|
||||||
PAPERLESS_ARCHIVE_FILE_GENERATION=auto
|
|
||||||
```
|
|
||||||
|
|
||||||
### Remote OCR parser
|
|
||||||
|
|
||||||
If you use the **remote OCR parser** (Azure AI), note that it always produces a
|
|
||||||
searchable PDF and stores it as the archive copy. `ARCHIVE_FILE_GENERATION=never`
|
|
||||||
has no effect for documents handled by the remote parser — the archive is produced
|
|
||||||
unconditionally by the remote engine.
|
|
||||||
|
|
||||||
## OpenID Connect Token Endpoint Authentication
|
## OpenID Connect Token Endpoint Authentication
|
||||||
|
|
||||||
Some existing OpenID Connect setups may require an explicit token endpoint authentication method after upgrading to v3.
|
Some existing OpenID Connect setups may require an explicit token endpoint authentication method after upgrading to v3.
|
||||||
|
|||||||
@@ -633,11 +633,12 @@ hardware, but a few settings can improve performance:
|
|||||||
consumption, so you might want to lower these settings (example: 2
|
consumption, so you might want to lower these settings (example: 2
|
||||||
workers and 1 thread to always have some computing power left for
|
workers and 1 thread to always have some computing power left for
|
||||||
other tasks).
|
other tasks).
|
||||||
- Keep [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) at its default value `auto` and consider
|
- Keep [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) at its default value `skip` and consider
|
||||||
OCRing your documents before feeding them into Paperless. Some
|
OCRing your documents before feeding them into Paperless. Some
|
||||||
scanners are able to do this!
|
scanners are able to do this!
|
||||||
- Set [`PAPERLESS_ARCHIVE_FILE_GENERATION`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) to `never` to skip archive
|
- Set [`PAPERLESS_OCR_SKIP_ARCHIVE_FILE`](configuration.md#PAPERLESS_OCR_SKIP_ARCHIVE_FILE) to `with_text` to skip archive
|
||||||
file generation entirely, saving disk space at the cost of in-browser PDF/A viewing.
|
file generation for already OCRed documents, or `always` to skip it
|
||||||
|
for all documents.
|
||||||
- If you want to perform OCR on the device, consider using
|
- If you want to perform OCR on the device, consider using
|
||||||
`PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use
|
`PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use
|
||||||
less memory at the expense of slightly worse OCR results.
|
less memory at the expense of slightly worse OCR results.
|
||||||
|
|||||||
@@ -134,9 +134,9 @@ following operations on your documents:
|
|||||||
!!! tip
|
!!! tip
|
||||||
|
|
||||||
This process can be configured to fit your needs. If you don't want
|
This process can be configured to fit your needs. If you don't want
|
||||||
paperless to create archived versions for born-digital documents, set
|
paperless to create archived versions for digital documents, you can
|
||||||
[`PAPERLESS_ARCHIVE_FILE_GENERATION=auto`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION)
|
configure that by configuring
|
||||||
(the default). To skip archives entirely, use `never`. Please read the
|
`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text`. Please read the
|
||||||
[relevant section in the documentation](configuration.md#ocr).
|
[relevant section in the documentation](configuration.md#ocr).
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
|
|||||||
@@ -50,14 +50,9 @@ from documents.utils import compute_checksum
|
|||||||
from documents.utils import copy_basic_file_stats
|
from documents.utils import copy_basic_file_stats
|
||||||
from documents.utils import copy_file_with_basic_stats
|
from documents.utils import copy_file_with_basic_stats
|
||||||
from documents.utils import run_subprocess
|
from documents.utils import run_subprocess
|
||||||
from paperless.config import OcrConfig
|
|
||||||
from paperless.models import ArchiveFileGenerationChoices
|
|
||||||
from paperless.parsers import ParserContext
|
from paperless.parsers import ParserContext
|
||||||
from paperless.parsers import ParserProtocol
|
from paperless.parsers import ParserProtocol
|
||||||
from paperless.parsers.registry import get_parser_registry
|
from paperless.parsers.registry import get_parser_registry
|
||||||
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
|
|
||||||
from paperless.parsers.utils import extract_pdf_text
|
|
||||||
from paperless.parsers.utils import is_tagged_pdf
|
|
||||||
|
|
||||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||||
|
|
||||||
@@ -110,44 +105,6 @@ class ConsumerStatusShortMessage(StrEnum):
|
|||||||
FAILED = "failed"
|
FAILED = "failed"
|
||||||
|
|
||||||
|
|
||||||
def should_produce_archive(
|
|
||||||
parser: "ParserProtocol",
|
|
||||||
mime_type: str,
|
|
||||||
document_path: Path,
|
|
||||||
) -> bool:
|
|
||||||
"""Return True if a PDF/A archive should be produced for this document.
|
|
||||||
|
|
||||||
IMPORTANT: *parser* must be an instantiated parser, not the class.
|
|
||||||
``requires_pdf_rendition`` and ``can_produce_archive`` are instance
|
|
||||||
``@property`` methods — accessing them on the class returns the descriptor
|
|
||||||
(always truthy).
|
|
||||||
"""
|
|
||||||
# Must produce a PDF so the frontend can display the original format at all.
|
|
||||||
if parser.requires_pdf_rendition:
|
|
||||||
return True
|
|
||||||
|
|
||||||
# Parser cannot produce an archive (e.g. TextDocumentParser).
|
|
||||||
if not parser.can_produce_archive:
|
|
||||||
return False
|
|
||||||
|
|
||||||
generation = OcrConfig().archive_file_generation
|
|
||||||
|
|
||||||
if generation == ArchiveFileGenerationChoices.ALWAYS:
|
|
||||||
return True
|
|
||||||
if generation == ArchiveFileGenerationChoices.NEVER:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# auto: produce archives for scanned/image documents; skip for born-digital PDFs.
|
|
||||||
if mime_type.startswith("image/"):
|
|
||||||
return True
|
|
||||||
if mime_type == "application/pdf":
|
|
||||||
if is_tagged_pdf(document_path):
|
|
||||||
return False
|
|
||||||
text = extract_pdf_text(document_path)
|
|
||||||
return text is None or len(text) <= PDF_TEXT_MIN_LENGTH
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
class ConsumerPluginMixin:
|
class ConsumerPluginMixin:
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from logging import Logger
|
from logging import Logger
|
||||||
@@ -481,16 +438,7 @@ class ConsumerPlugin(
|
|||||||
)
|
)
|
||||||
self.log.debug(f"Parsing {self.filename}...")
|
self.log.debug(f"Parsing {self.filename}...")
|
||||||
|
|
||||||
produce_archive = should_produce_archive(
|
document_parser.parse(self.working_copy, mime_type)
|
||||||
document_parser,
|
|
||||||
mime_type,
|
|
||||||
self.working_copy,
|
|
||||||
)
|
|
||||||
document_parser.parse(
|
|
||||||
self.working_copy,
|
|
||||||
mime_type,
|
|
||||||
produce_archive=produce_archive,
|
|
||||||
)
|
|
||||||
|
|
||||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||||
self._send_progress(
|
self._send_progress(
|
||||||
@@ -839,7 +787,7 @@ class ConsumerPlugin(
|
|||||||
|
|
||||||
return document
|
return document
|
||||||
|
|
||||||
def apply_overrides(self, document: Document) -> None:
|
def apply_overrides(self, document) -> None:
|
||||||
if self.metadata.correspondent_id:
|
if self.metadata.correspondent_id:
|
||||||
document.correspondent = Correspondent.objects.get(
|
document.correspondent = Correspondent.objects.get(
|
||||||
pk=self.metadata.correspondent_id,
|
pk=self.metadata.correspondent_id,
|
||||||
|
|||||||
@@ -34,7 +34,6 @@ from documents.consumer import AsnCheckPlugin
|
|||||||
from documents.consumer import ConsumerPlugin
|
from documents.consumer import ConsumerPlugin
|
||||||
from documents.consumer import ConsumerPreflightPlugin
|
from documents.consumer import ConsumerPreflightPlugin
|
||||||
from documents.consumer import WorkflowTriggerPlugin
|
from documents.consumer import WorkflowTriggerPlugin
|
||||||
from documents.consumer import should_produce_archive
|
|
||||||
from documents.data_models import ConsumableDocument
|
from documents.data_models import ConsumableDocument
|
||||||
from documents.data_models import DocumentMetadataOverrides
|
from documents.data_models import DocumentMetadataOverrides
|
||||||
from documents.double_sided import CollatePlugin
|
from documents.double_sided import CollatePlugin
|
||||||
@@ -322,16 +321,7 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
|||||||
parser.configure(ParserContext())
|
parser.configure(ParserContext())
|
||||||
|
|
||||||
try:
|
try:
|
||||||
produce_archive = should_produce_archive(
|
parser.parse(document.source_path, mime_type)
|
||||||
parser,
|
|
||||||
mime_type,
|
|
||||||
document.source_path,
|
|
||||||
)
|
|
||||||
parser.parse(
|
|
||||||
document.source_path,
|
|
||||||
mime_type,
|
|
||||||
produce_archive=produce_archive,
|
|
||||||
)
|
|
||||||
|
|
||||||
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
|
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
|
||||||
|
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
|
|||||||
"pages": None,
|
"pages": None,
|
||||||
"language": None,
|
"language": None,
|
||||||
"mode": None,
|
"mode": None,
|
||||||
"archive_file_generation": None,
|
"skip_archive_file": None,
|
||||||
"image_dpi": None,
|
"image_dpi": None,
|
||||||
"unpaper_clean": None,
|
"unpaper_clean": None,
|
||||||
"deskew": None,
|
"deskew": None,
|
||||||
|
|||||||
@@ -1020,7 +1020,7 @@ class TestTagBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, Tes
|
|||||||
CONSUMER_TAG_BARCODE_SPLIT=True,
|
CONSUMER_TAG_BARCODE_SPLIT=True,
|
||||||
CONSUMER_TAG_BARCODE_MAPPING={"TAG:(.*)": "\\g<1>"},
|
CONSUMER_TAG_BARCODE_MAPPING={"TAG:(.*)": "\\g<1>"},
|
||||||
CELERY_TASK_ALWAYS_EAGER=True,
|
CELERY_TASK_ALWAYS_EAGER=True,
|
||||||
OCR_MODE="auto",
|
OCR_MODE="skip",
|
||||||
)
|
)
|
||||||
def test_consume_barcode_file_tag_split_and_assignment(self) -> None:
|
def test_consume_barcode_file_tag_split_and_assignment(self) -> None:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -230,11 +230,7 @@ class TestConsumer(
|
|||||||
shutil.copy(src, dst)
|
shutil.copy(src, dst)
|
||||||
return dst
|
return dst
|
||||||
|
|
||||||
@override_settings(
|
@override_settings(FILENAME_FORMAT=None, TIME_ZONE="America/Chicago")
|
||||||
FILENAME_FORMAT=None,
|
|
||||||
TIME_ZONE="America/Chicago",
|
|
||||||
ARCHIVE_FILE_GENERATION="always",
|
|
||||||
)
|
|
||||||
def testNormalOperation(self) -> None:
|
def testNormalOperation(self) -> None:
|
||||||
filename = self.get_test_file()
|
filename = self.get_test_file()
|
||||||
|
|
||||||
@@ -633,10 +629,7 @@ class TestConsumer(
|
|||||||
# Database empty
|
# Database empty
|
||||||
self.assertEqual(Document.objects.all().count(), 0)
|
self.assertEqual(Document.objects.all().count(), 0)
|
||||||
|
|
||||||
@override_settings(
|
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
|
||||||
FILENAME_FORMAT="{correspondent}/{title}",
|
|
||||||
ARCHIVE_FILE_GENERATION="always",
|
|
||||||
)
|
|
||||||
def testFilenameHandling(self) -> None:
|
def testFilenameHandling(self) -> None:
|
||||||
with self.get_consumer(
|
with self.get_consumer(
|
||||||
self.get_test_file(),
|
self.get_test_file(),
|
||||||
@@ -653,7 +646,7 @@ class TestConsumer(
|
|||||||
self._assert_first_last_send_progress()
|
self._assert_first_last_send_progress()
|
||||||
|
|
||||||
@mock.patch("documents.consumer.generate_unique_filename")
|
@mock.patch("documents.consumer.generate_unique_filename")
|
||||||
@override_settings(FILENAME_FORMAT="{pk}", ARCHIVE_FILE_GENERATION="always")
|
@override_settings(FILENAME_FORMAT="{pk}")
|
||||||
def testFilenameHandlingFallsBackWhenGeneratedPathExceedsDbLimit(self, m):
|
def testFilenameHandlingFallsBackWhenGeneratedPathExceedsDbLimit(self, m):
|
||||||
m.side_effect = lambda doc, archive_filename=False: Path(
|
m.side_effect = lambda doc, archive_filename=False: Path(
|
||||||
("a" * 1100 + ".pdf") if not archive_filename else ("b" * 1100 + ".pdf"),
|
("a" * 1100 + ".pdf") if not archive_filename else ("b" * 1100 + ".pdf"),
|
||||||
@@ -680,10 +673,7 @@ class TestConsumer(
|
|||||||
|
|
||||||
self._assert_first_last_send_progress()
|
self._assert_first_last_send_progress()
|
||||||
|
|
||||||
@override_settings(
|
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
|
||||||
FILENAME_FORMAT="{correspondent}/{title}",
|
|
||||||
ARCHIVE_FILE_GENERATION="always",
|
|
||||||
)
|
|
||||||
@mock.patch("documents.signals.handlers.generate_unique_filename")
|
@mock.patch("documents.signals.handlers.generate_unique_filename")
|
||||||
def testFilenameHandlingUnstableFormat(self, m) -> None:
|
def testFilenameHandlingUnstableFormat(self, m) -> None:
|
||||||
filenames = ["this", "that", "now this", "i cannot decide"]
|
filenames = ["this", "that", "now this", "i cannot decide"]
|
||||||
@@ -1031,7 +1021,7 @@ class TestConsumer(
|
|||||||
self.assertEqual(Document.objects.count(), 2)
|
self.assertEqual(Document.objects.count(), 2)
|
||||||
self._assert_first_last_send_progress()
|
self._assert_first_last_send_progress()
|
||||||
|
|
||||||
@override_settings(FILENAME_FORMAT="{title}", ARCHIVE_FILE_GENERATION="always")
|
@override_settings(FILENAME_FORMAT="{title}")
|
||||||
@mock.patch("documents.consumer.get_parser_registry")
|
@mock.patch("documents.consumer.get_parser_registry")
|
||||||
def test_similar_filenames(self, m) -> None:
|
def test_similar_filenames(self, m) -> None:
|
||||||
shutil.copy(
|
shutil.copy(
|
||||||
@@ -1142,7 +1132,6 @@ class TestConsumer(
|
|||||||
mock_mail_parser_parse.assert_called_once_with(
|
mock_mail_parser_parse.assert_called_once_with(
|
||||||
consumer.working_copy,
|
consumer.working_copy,
|
||||||
"message/rfc822",
|
"message/rfc822",
|
||||||
produce_archive=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -1290,14 +1279,7 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
|
|||||||
def test_no_pre_consume_script(self, m) -> None:
|
def test_no_pre_consume_script(self, m) -> None:
|
||||||
with self.get_consumer(self.test_file) as c:
|
with self.get_consumer(self.test_file) as c:
|
||||||
c.run()
|
c.run()
|
||||||
# Verify no pre-consume script subprocess was invoked
|
m.assert_not_called()
|
||||||
# (run_subprocess may still be called by _extract_text_for_archive_check)
|
|
||||||
script_calls = [
|
|
||||||
call
|
|
||||||
for call in m.call_args_list
|
|
||||||
if call.args and call.args[0] and call.args[0][0] not in ("pdftotext",)
|
|
||||||
]
|
|
||||||
self.assertEqual(script_calls, [])
|
|
||||||
|
|
||||||
@mock.patch("documents.consumer.run_subprocess")
|
@mock.patch("documents.consumer.run_subprocess")
|
||||||
@override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
|
@override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
|
||||||
@@ -1313,16 +1295,9 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
|
|||||||
with self.get_consumer(self.test_file) as c:
|
with self.get_consumer(self.test_file) as c:
|
||||||
c.run()
|
c.run()
|
||||||
|
|
||||||
self.assertTrue(m.called)
|
m.assert_called_once()
|
||||||
|
|
||||||
# Find the call that invoked the pre-consume script
|
args, _ = m.call_args
|
||||||
# (run_subprocess may also be called by _extract_text_for_archive_check)
|
|
||||||
script_call = next(
|
|
||||||
call
|
|
||||||
for call in m.call_args_list
|
|
||||||
if call.args and call.args[0] and call.args[0][0] == script.name
|
|
||||||
)
|
|
||||||
args, _ = script_call
|
|
||||||
|
|
||||||
command = args[0]
|
command = args[0]
|
||||||
environment = args[1]
|
environment = args[1]
|
||||||
|
|||||||
@@ -1,189 +0,0 @@
|
|||||||
"""Tests for should_produce_archive()."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import TYPE_CHECKING
|
|
||||||
from unittest.mock import MagicMock
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from documents.consumer import should_produce_archive
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from pytest_mock import MockerFixture
|
|
||||||
|
|
||||||
|
|
||||||
def _parser_instance(
|
|
||||||
*,
|
|
||||||
can_produce: bool = True,
|
|
||||||
requires_rendition: bool = False,
|
|
||||||
) -> MagicMock:
|
|
||||||
"""Return a mock parser instance with the given capability flags."""
|
|
||||||
instance = MagicMock()
|
|
||||||
instance.can_produce_archive = can_produce
|
|
||||||
instance.requires_pdf_rendition = requires_rendition
|
|
||||||
return instance
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
|
||||||
def null_app_config(mocker) -> MagicMock:
|
|
||||||
"""Mock ApplicationConfiguration with all fields None → falls back to Django settings."""
|
|
||||||
return mocker.MagicMock(
|
|
||||||
output_type=None,
|
|
||||||
pages=None,
|
|
||||||
language=None,
|
|
||||||
mode=None,
|
|
||||||
archive_file_generation=None,
|
|
||||||
image_dpi=None,
|
|
||||||
unpaper_clean=None,
|
|
||||||
deskew=None,
|
|
||||||
rotate_pages=None,
|
|
||||||
rotate_pages_threshold=None,
|
|
||||||
max_image_pixels=None,
|
|
||||||
color_conversion_strategy=None,
|
|
||||||
user_args=None,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
|
||||||
def patch_app_config(mocker, null_app_config):
|
|
||||||
"""Patch BaseConfig._get_config_instance for all tests in this module."""
|
|
||||||
mocker.patch(
|
|
||||||
"paperless.config.BaseConfig._get_config_instance",
|
|
||||||
return_value=null_app_config,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class TestShouldProduceArchive:
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
("generation", "can_produce", "requires_rendition", "mime", "expected"),
|
|
||||||
[
|
|
||||||
pytest.param(
|
|
||||||
"never",
|
|
||||||
True,
|
|
||||||
False,
|
|
||||||
"application/pdf",
|
|
||||||
False,
|
|
||||||
id="never-returns-false",
|
|
||||||
),
|
|
||||||
pytest.param(
|
|
||||||
"always",
|
|
||||||
True,
|
|
||||||
False,
|
|
||||||
"application/pdf",
|
|
||||||
True,
|
|
||||||
id="always-returns-true",
|
|
||||||
),
|
|
||||||
pytest.param(
|
|
||||||
"never",
|
|
||||||
True,
|
|
||||||
True,
|
|
||||||
"application/pdf",
|
|
||||||
True,
|
|
||||||
id="requires-rendition-overrides-never",
|
|
||||||
),
|
|
||||||
pytest.param(
|
|
||||||
"always",
|
|
||||||
False,
|
|
||||||
False,
|
|
||||||
"text/plain",
|
|
||||||
False,
|
|
||||||
id="cannot-produce-overrides-always",
|
|
||||||
),
|
|
||||||
pytest.param(
|
|
||||||
"always",
|
|
||||||
False,
|
|
||||||
True,
|
|
||||||
"application/pdf",
|
|
||||||
True,
|
|
||||||
id="requires-rendition-wins-even-if-cannot-produce",
|
|
||||||
),
|
|
||||||
pytest.param(
|
|
||||||
"auto",
|
|
||||||
True,
|
|
||||||
False,
|
|
||||||
"image/tiff",
|
|
||||||
True,
|
|
||||||
id="auto-image-returns-true",
|
|
||||||
),
|
|
||||||
pytest.param(
|
|
||||||
"auto",
|
|
||||||
True,
|
|
||||||
False,
|
|
||||||
"message/rfc822",
|
|
||||||
False,
|
|
||||||
id="auto-non-pdf-non-image-returns-false",
|
|
||||||
),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_generation_setting(
|
|
||||||
self,
|
|
||||||
settings,
|
|
||||||
generation: str,
|
|
||||||
can_produce: bool, # noqa: FBT001
|
|
||||||
requires_rendition: bool, # noqa: FBT001
|
|
||||||
mime: str,
|
|
||||||
expected: bool, # noqa: FBT001
|
|
||||||
) -> None:
|
|
||||||
settings.ARCHIVE_FILE_GENERATION = generation
|
|
||||||
parser = _parser_instance(
|
|
||||||
can_produce=can_produce,
|
|
||||||
requires_rendition=requires_rendition,
|
|
||||||
)
|
|
||||||
assert should_produce_archive(parser, mime, Path("/tmp/doc")) is expected
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
("extracted_text", "expected"),
|
|
||||||
[
|
|
||||||
pytest.param(
|
|
||||||
"This is a born-digital PDF with lots of text content. " * 10,
|
|
||||||
False,
|
|
||||||
id="born-digital-long-text-skips-archive",
|
|
||||||
),
|
|
||||||
pytest.param(None, True, id="no-text-scanned-produces-archive"),
|
|
||||||
pytest.param("tiny", True, id="short-text-treated-as-scanned"),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_auto_pdf_archive_decision(
|
|
||||||
self,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
settings,
|
|
||||||
extracted_text: str | None,
|
|
||||||
expected: bool, # noqa: FBT001
|
|
||||||
) -> None:
|
|
||||||
settings.ARCHIVE_FILE_GENERATION = "auto"
|
|
||||||
mocker.patch("documents.consumer.is_tagged_pdf", return_value=False)
|
|
||||||
mocker.patch("documents.consumer.extract_pdf_text", return_value=extracted_text)
|
|
||||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
|
||||||
assert (
|
|
||||||
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
|
|
||||||
is expected
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_tagged_pdf_skips_archive_in_auto_mode(
|
|
||||||
self,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
settings,
|
|
||||||
) -> None:
|
|
||||||
"""Tagged PDFs (e.g. Word exports) are treated as born-digital regardless of text length."""
|
|
||||||
settings.ARCHIVE_FILE_GENERATION = "auto"
|
|
||||||
mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
|
|
||||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
|
||||||
assert (
|
|
||||||
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
|
|
||||||
is False
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_tagged_pdf_does_not_call_pdftotext(
|
|
||||||
self,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
settings,
|
|
||||||
) -> None:
|
|
||||||
"""When a PDF is tagged, pdftotext is not invoked (fast path)."""
|
|
||||||
settings.ARCHIVE_FILE_GENERATION = "auto"
|
|
||||||
mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
|
|
||||||
mock_extract = mocker.patch("documents.consumer.extract_pdf_text")
|
|
||||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
|
||||||
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
|
|
||||||
mock_extract.assert_not_called()
|
|
||||||
@@ -27,10 +27,7 @@ sample_file: Path = Path(__file__).parent / "samples" / "simple.pdf"
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.management
|
@pytest.mark.management
|
||||||
@override_settings(
|
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
|
||||||
FILENAME_FORMAT="{correspondent}/{title}",
|
|
||||||
ARCHIVE_FILE_GENERATION="always",
|
|
||||||
)
|
|
||||||
class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||||
def make_models(self):
|
def make_models(self):
|
||||||
return Document.objects.create(
|
return Document.objects.create(
|
||||||
|
|||||||
@@ -232,7 +232,6 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
self.assertEqual(Document.global_objects.count(), 0)
|
self.assertEqual(Document.global_objects.count(), 0)
|
||||||
|
|
||||||
|
|
||||||
@override_settings(ARCHIVE_FILE_GENERATION="always")
|
|
||||||
class TestUpdateContent(DirectoriesMixin, TestCase):
|
class TestUpdateContent(DirectoriesMixin, TestCase):
|
||||||
def test_update_content_maybe_archive_file(self) -> None:
|
def test_update_content_maybe_archive_file(self) -> None:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ import shutil
|
|||||||
import stat
|
import stat
|
||||||
import subprocess
|
import subprocess
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.core.checks import Error
|
from django.core.checks import Error
|
||||||
@@ -23,7 +22,7 @@ writeable_hint = (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def path_check(var: str, directory: Path) -> list[Error]:
|
def path_check(var, directory: Path) -> list[Error]:
|
||||||
messages: list[Error] = []
|
messages: list[Error] = []
|
||||||
if directory:
|
if directory:
|
||||||
if not directory.is_dir():
|
if not directory.is_dir():
|
||||||
@@ -60,7 +59,7 @@ def path_check(var: str, directory: Path) -> list[Error]:
|
|||||||
|
|
||||||
|
|
||||||
@register()
|
@register()
|
||||||
def paths_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
def paths_check(app_configs, **kwargs) -> list[Error]:
|
||||||
"""
|
"""
|
||||||
Check the various paths for existence, readability and writeability
|
Check the various paths for existence, readability and writeability
|
||||||
"""
|
"""
|
||||||
@@ -74,7 +73,7 @@ def paths_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
|||||||
|
|
||||||
|
|
||||||
@register()
|
@register()
|
||||||
def binaries_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
def binaries_check(app_configs, **kwargs):
|
||||||
"""
|
"""
|
||||||
Paperless requires the existence of a few binaries, so we do some checks
|
Paperless requires the existence of a few binaries, so we do some checks
|
||||||
for those here.
|
for those here.
|
||||||
@@ -94,7 +93,7 @@ def binaries_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
|||||||
|
|
||||||
|
|
||||||
@register()
|
@register()
|
||||||
def debug_mode_check(app_configs: Any, **kwargs: Any) -> list[Warning]:
|
def debug_mode_check(app_configs, **kwargs):
|
||||||
if settings.DEBUG:
|
if settings.DEBUG:
|
||||||
return [
|
return [
|
||||||
Warning(
|
Warning(
|
||||||
@@ -110,7 +109,7 @@ def debug_mode_check(app_configs: Any, **kwargs: Any) -> list[Warning]:
|
|||||||
|
|
||||||
|
|
||||||
@register()
|
@register()
|
||||||
def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warning]:
|
def settings_values_check(app_configs, **kwargs):
|
||||||
"""
|
"""
|
||||||
Validates at least some of the user provided settings
|
Validates at least some of the user provided settings
|
||||||
"""
|
"""
|
||||||
@@ -133,14 +132,23 @@ def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warni
|
|||||||
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
|
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
|
||||||
)
|
)
|
||||||
|
|
||||||
if settings.OCR_MODE not in {"auto", "force", "redo", "off"}:
|
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
|
||||||
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
|
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
|
||||||
|
|
||||||
if settings.ARCHIVE_FILE_GENERATION not in {"auto", "always", "never"}:
|
if settings.OCR_MODE == "skip_noarchive":
|
||||||
|
msgs.append(
|
||||||
|
Warning(
|
||||||
|
'OCR output mode "skip_noarchive" is deprecated and will be '
|
||||||
|
"removed in a future version. Please use "
|
||||||
|
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
|
||||||
msgs.append(
|
msgs.append(
|
||||||
Error(
|
Error(
|
||||||
"PAPERLESS_ARCHIVE_FILE_GENERATION setting "
|
"OCR_SKIP_ARCHIVE_FILE setting "
|
||||||
f'"{settings.ARCHIVE_FILE_GENERATION}" is not valid',
|
f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid',
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -183,7 +191,7 @@ def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warni
|
|||||||
|
|
||||||
|
|
||||||
@register()
|
@register()
|
||||||
def audit_log_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
def audit_log_check(app_configs, **kwargs):
|
||||||
db_conn = connections["default"]
|
db_conn = connections["default"]
|
||||||
all_tables = db_conn.introspection.table_names()
|
all_tables = db_conn.introspection.table_names()
|
||||||
result = []
|
result = []
|
||||||
@@ -295,42 +303,7 @@ def check_deprecated_db_settings(
|
|||||||
|
|
||||||
|
|
||||||
@register()
|
@register()
|
||||||
def check_deprecated_v2_ocr_env_vars(
|
def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
|
||||||
app_configs: object,
|
|
||||||
**kwargs: object,
|
|
||||||
) -> list[Warning]:
|
|
||||||
"""Warn when deprecated v2 OCR environment variables are set.
|
|
||||||
|
|
||||||
Users upgrading from v2 may still have these in their environment or
|
|
||||||
config files, where they are now silently ignored.
|
|
||||||
"""
|
|
||||||
warnings: list[Warning] = []
|
|
||||||
|
|
||||||
if os.environ.get("PAPERLESS_OCR_SKIP_ARCHIVE_FILE"):
|
|
||||||
warnings.append(
|
|
||||||
Warning(
|
|
||||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE is set but has no effect. "
|
|
||||||
"Use PAPERLESS_ARCHIVE_FILE_GENERATION=never/always/auto instead.",
|
|
||||||
id="paperless.W002",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
ocr_mode = os.environ.get("PAPERLESS_OCR_MODE", "")
|
|
||||||
if ocr_mode in {"skip", "skip_noarchive"}:
|
|
||||||
warnings.append(
|
|
||||||
Warning(
|
|
||||||
f"PAPERLESS_OCR_MODE={ocr_mode!r} is not a valid value. "
|
|
||||||
f"Use PAPERLESS_OCR_MODE=auto (and PAPERLESS_ARCHIVE_FILE_GENERATION=never "
|
|
||||||
f"if you used skip_noarchive) instead.",
|
|
||||||
id="paperless.W003",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
return warnings
|
|
||||||
|
|
||||||
|
|
||||||
@register()
|
|
||||||
def check_remote_parser_configured(app_configs: Any, **kwargs: Any) -> list[Error]:
|
|
||||||
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
|
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
|
||||||
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
|
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
|
||||||
):
|
):
|
||||||
@@ -356,7 +329,7 @@ def get_tesseract_langs():
|
|||||||
|
|
||||||
|
|
||||||
@register()
|
@register()
|
||||||
def check_default_language_available(app_configs: Any, **kwargs: Any) -> list[Error]:
|
def check_default_language_available(app_configs, **kwargs):
|
||||||
errs = []
|
errs = []
|
||||||
|
|
||||||
if not settings.OCR_LANGUAGE:
|
if not settings.OCR_LANGUAGE:
|
||||||
|
|||||||
@@ -4,11 +4,6 @@ import json
|
|||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
from paperless.models import ApplicationConfiguration
|
from paperless.models import ApplicationConfiguration
|
||||||
from paperless.models import ArchiveFileGenerationChoices
|
|
||||||
from paperless.models import CleanChoices
|
|
||||||
from paperless.models import ColorConvertChoices
|
|
||||||
from paperless.models import ModeChoices
|
|
||||||
from paperless.models import OutputTypeChoices
|
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass
|
@dataclasses.dataclass
|
||||||
@@ -33,7 +28,7 @@ class OutputTypeConfig(BaseConfig):
|
|||||||
Almost all parsers care about the chosen PDF output format
|
Almost all parsers care about the chosen PDF output format
|
||||||
"""
|
"""
|
||||||
|
|
||||||
output_type: OutputTypeChoices = dataclasses.field(init=False)
|
output_type: str = dataclasses.field(init=False)
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
def __post_init__(self) -> None:
|
||||||
app_config = self._get_config_instance()
|
app_config = self._get_config_instance()
|
||||||
@@ -50,17 +45,15 @@ class OcrConfig(OutputTypeConfig):
|
|||||||
|
|
||||||
pages: int | None = dataclasses.field(init=False)
|
pages: int | None = dataclasses.field(init=False)
|
||||||
language: str = dataclasses.field(init=False)
|
language: str = dataclasses.field(init=False)
|
||||||
mode: ModeChoices = dataclasses.field(init=False)
|
mode: str = dataclasses.field(init=False)
|
||||||
archive_file_generation: ArchiveFileGenerationChoices = dataclasses.field(
|
skip_archive_file: str = dataclasses.field(init=False)
|
||||||
init=False,
|
|
||||||
)
|
|
||||||
image_dpi: int | None = dataclasses.field(init=False)
|
image_dpi: int | None = dataclasses.field(init=False)
|
||||||
clean: CleanChoices = dataclasses.field(init=False)
|
clean: str = dataclasses.field(init=False)
|
||||||
deskew: bool = dataclasses.field(init=False)
|
deskew: bool = dataclasses.field(init=False)
|
||||||
rotate: bool = dataclasses.field(init=False)
|
rotate: bool = dataclasses.field(init=False)
|
||||||
rotate_threshold: float = dataclasses.field(init=False)
|
rotate_threshold: float = dataclasses.field(init=False)
|
||||||
max_image_pixel: float | None = dataclasses.field(init=False)
|
max_image_pixel: float | None = dataclasses.field(init=False)
|
||||||
color_conversion_strategy: ColorConvertChoices = dataclasses.field(init=False)
|
color_conversion_strategy: str = dataclasses.field(init=False)
|
||||||
user_args: dict[str, str] | None = dataclasses.field(init=False)
|
user_args: dict[str, str] | None = dataclasses.field(init=False)
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
def __post_init__(self) -> None:
|
||||||
@@ -71,8 +64,8 @@ class OcrConfig(OutputTypeConfig):
|
|||||||
self.pages = app_config.pages or settings.OCR_PAGES
|
self.pages = app_config.pages or settings.OCR_PAGES
|
||||||
self.language = app_config.language or settings.OCR_LANGUAGE
|
self.language = app_config.language or settings.OCR_LANGUAGE
|
||||||
self.mode = app_config.mode or settings.OCR_MODE
|
self.mode = app_config.mode or settings.OCR_MODE
|
||||||
self.archive_file_generation = (
|
self.skip_archive_file = (
|
||||||
app_config.archive_file_generation or settings.ARCHIVE_FILE_GENERATION
|
app_config.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
|
||||||
)
|
)
|
||||||
self.image_dpi = app_config.image_dpi or settings.OCR_IMAGE_DPI
|
self.image_dpi = app_config.image_dpi or settings.OCR_IMAGE_DPI
|
||||||
self.clean = app_config.unpaper_clean or settings.OCR_CLEAN
|
self.clean = app_config.unpaper_clean or settings.OCR_CLEAN
|
||||||
|
|||||||
@@ -1,44 +0,0 @@
|
|||||||
# Generated by Django 5.2.12 on 2026-03-26 20:31
|
|
||||||
|
|
||||||
from django.db import migrations
|
|
||||||
from django.db import models
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
|
||||||
dependencies = [
|
|
||||||
("paperless", "0007_optimize_integer_field_sizes"),
|
|
||||||
]
|
|
||||||
|
|
||||||
operations = [
|
|
||||||
migrations.RemoveField(
|
|
||||||
model_name="applicationconfiguration",
|
|
||||||
name="skip_archive_file",
|
|
||||||
),
|
|
||||||
migrations.AddField(
|
|
||||||
model_name="applicationconfiguration",
|
|
||||||
name="archive_file_generation",
|
|
||||||
field=models.CharField(
|
|
||||||
blank=True,
|
|
||||||
choices=[("auto", "auto"), ("always", "always"), ("never", "never")],
|
|
||||||
max_length=8,
|
|
||||||
null=True,
|
|
||||||
verbose_name="Controls archive file generation",
|
|
||||||
),
|
|
||||||
),
|
|
||||||
migrations.AlterField(
|
|
||||||
model_name="applicationconfiguration",
|
|
||||||
name="mode",
|
|
||||||
field=models.CharField(
|
|
||||||
blank=True,
|
|
||||||
choices=[
|
|
||||||
("auto", "auto"),
|
|
||||||
("force", "force"),
|
|
||||||
("redo", "redo"),
|
|
||||||
("off", "off"),
|
|
||||||
],
|
|
||||||
max_length=16,
|
|
||||||
null=True,
|
|
||||||
verbose_name="Sets the OCR mode",
|
|
||||||
),
|
|
||||||
),
|
|
||||||
]
|
|
||||||
@@ -36,20 +36,20 @@ class ModeChoices(models.TextChoices):
|
|||||||
and our own custom setting
|
and our own custom setting
|
||||||
"""
|
"""
|
||||||
|
|
||||||
AUTO = ("auto", _("auto"))
|
SKIP = ("skip", _("skip"))
|
||||||
FORCE = ("force", _("force"))
|
|
||||||
REDO = ("redo", _("redo"))
|
REDO = ("redo", _("redo"))
|
||||||
OFF = ("off", _("off"))
|
FORCE = ("force", _("force"))
|
||||||
|
SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive"))
|
||||||
|
|
||||||
|
|
||||||
class ArchiveFileGenerationChoices(models.TextChoices):
|
class ArchiveFileChoices(models.TextChoices):
|
||||||
"""
|
"""
|
||||||
Settings to control creation of an archive PDF file
|
Settings to control creation of an archive PDF file
|
||||||
"""
|
"""
|
||||||
|
|
||||||
AUTO = ("auto", _("auto"))
|
|
||||||
ALWAYS = ("always", _("always"))
|
|
||||||
NEVER = ("never", _("never"))
|
NEVER = ("never", _("never"))
|
||||||
|
WITH_TEXT = ("with_text", _("with_text"))
|
||||||
|
ALWAYS = ("always", _("always"))
|
||||||
|
|
||||||
|
|
||||||
class CleanChoices(models.TextChoices):
|
class CleanChoices(models.TextChoices):
|
||||||
@@ -126,12 +126,12 @@ class ApplicationConfiguration(AbstractSingletonModel):
|
|||||||
choices=ModeChoices.choices,
|
choices=ModeChoices.choices,
|
||||||
)
|
)
|
||||||
|
|
||||||
archive_file_generation = models.CharField(
|
skip_archive_file = models.CharField(
|
||||||
verbose_name=_("Controls archive file generation"),
|
verbose_name=_("Controls the generation of an archive file"),
|
||||||
null=True,
|
null=True,
|
||||||
blank=True,
|
blank=True,
|
||||||
max_length=8,
|
max_length=16,
|
||||||
choices=ArchiveFileGenerationChoices.choices,
|
choices=ArchiveFileChoices.choices,
|
||||||
)
|
)
|
||||||
|
|
||||||
image_dpi = models.PositiveSmallIntegerField(
|
image_dpi = models.PositiveSmallIntegerField(
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import importlib.resources
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
@@ -9,8 +8,6 @@ import tempfile
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from typing import Final
|
|
||||||
from typing import NoReturn
|
|
||||||
from typing import Self
|
from typing import Self
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
@@ -21,11 +18,9 @@ from documents.parsers import make_thumbnail_from_pdf
|
|||||||
from documents.utils import maybe_override_pixel_limit
|
from documents.utils import maybe_override_pixel_limit
|
||||||
from documents.utils import run_subprocess
|
from documents.utils import run_subprocess
|
||||||
from paperless.config import OcrConfig
|
from paperless.config import OcrConfig
|
||||||
|
from paperless.models import ArchiveFileChoices
|
||||||
from paperless.models import CleanChoices
|
from paperless.models import CleanChoices
|
||||||
from paperless.models import ModeChoices
|
from paperless.models import ModeChoices
|
||||||
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
|
|
||||||
from paperless.parsers.utils import extract_pdf_text
|
|
||||||
from paperless.parsers.utils import is_tagged_pdf
|
|
||||||
from paperless.parsers.utils import read_file_handle_unicode_errors
|
from paperless.parsers.utils import read_file_handle_unicode_errors
|
||||||
from paperless.version import __full_version_str__
|
from paperless.version import __full_version_str__
|
||||||
|
|
||||||
@@ -38,11 +33,7 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
logger = logging.getLogger("paperless.parsing.tesseract")
|
logger = logging.getLogger("paperless.parsing.tesseract")
|
||||||
|
|
||||||
_SRGB_ICC_DATA: Final[bytes] = (
|
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||||
importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
|
|
||||||
)
|
|
||||||
|
|
||||||
_SUPPORTED_MIME_TYPES: Final[dict[str, str]] = {
|
|
||||||
"application/pdf": ".pdf",
|
"application/pdf": ".pdf",
|
||||||
"image/jpeg": ".jpg",
|
"image/jpeg": ".jpg",
|
||||||
"image/png": ".png",
|
"image/png": ".png",
|
||||||
@@ -108,7 +99,7 @@ class RasterisedDocumentParser:
|
|||||||
# Lifecycle
|
# Lifecycle
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
def __init__(self, logging_group: object | None = None) -> None:
|
def __init__(self, logging_group: object = None) -> None:
|
||||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
self.tempdir = Path(
|
self.tempdir = Path(
|
||||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||||
@@ -242,7 +233,7 @@ class RasterisedDocumentParser:
|
|||||||
if (
|
if (
|
||||||
sidecar_file is not None
|
sidecar_file is not None
|
||||||
and sidecar_file.is_file()
|
and sidecar_file.is_file()
|
||||||
and self.settings.mode != ModeChoices.REDO
|
and self.settings.mode != "redo"
|
||||||
):
|
):
|
||||||
text = read_file_handle_unicode_errors(sidecar_file)
|
text = read_file_handle_unicode_errors(sidecar_file)
|
||||||
|
|
||||||
@@ -259,7 +250,36 @@ class RasterisedDocumentParser:
|
|||||||
if not Path(pdf_file).is_file():
|
if not Path(pdf_file).is_file():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return post_process_text(extract_pdf_text(Path(pdf_file), log=self.log))
|
try:
|
||||||
|
text = None
|
||||||
|
with tempfile.NamedTemporaryFile(
|
||||||
|
mode="w+",
|
||||||
|
dir=self.tempdir,
|
||||||
|
) as tmp:
|
||||||
|
run_subprocess(
|
||||||
|
[
|
||||||
|
"pdftotext",
|
||||||
|
"-q",
|
||||||
|
"-layout",
|
||||||
|
"-enc",
|
||||||
|
"UTF-8",
|
||||||
|
str(pdf_file),
|
||||||
|
tmp.name,
|
||||||
|
],
|
||||||
|
logger=self.log,
|
||||||
|
)
|
||||||
|
text = read_file_handle_unicode_errors(Path(tmp.name))
|
||||||
|
|
||||||
|
return post_process_text(text)
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
# If pdftotext fails, fall back to OCR.
|
||||||
|
self.log.warning(
|
||||||
|
"Error while getting text from PDF document with pdftotext",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
# probably not a PDF file.
|
||||||
|
return None
|
||||||
|
|
||||||
def construct_ocrmypdf_parameters(
|
def construct_ocrmypdf_parameters(
|
||||||
self,
|
self,
|
||||||
@@ -269,7 +289,6 @@ class RasterisedDocumentParser:
|
|||||||
sidecar_file: Path,
|
sidecar_file: Path,
|
||||||
*,
|
*,
|
||||||
safe_fallback: bool = False,
|
safe_fallback: bool = False,
|
||||||
skip_text: bool = False,
|
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
ocrmypdf_args: dict[str, Any] = {
|
ocrmypdf_args: dict[str, Any] = {
|
||||||
"input_file_or_options": input_file,
|
"input_file_or_options": input_file,
|
||||||
@@ -288,14 +307,15 @@ class RasterisedDocumentParser:
|
|||||||
self.settings.color_conversion_strategy
|
self.settings.color_conversion_strategy
|
||||||
)
|
)
|
||||||
|
|
||||||
if safe_fallback or self.settings.mode == ModeChoices.FORCE:
|
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
|
||||||
ocrmypdf_args["force_ocr"] = True
|
ocrmypdf_args["force_ocr"] = True
|
||||||
|
elif self.settings.mode in {
|
||||||
|
ModeChoices.SKIP,
|
||||||
|
ModeChoices.SKIP_NO_ARCHIVE,
|
||||||
|
}:
|
||||||
|
ocrmypdf_args["skip_text"] = True
|
||||||
elif self.settings.mode == ModeChoices.REDO:
|
elif self.settings.mode == ModeChoices.REDO:
|
||||||
ocrmypdf_args["redo_ocr"] = True
|
ocrmypdf_args["redo_ocr"] = True
|
||||||
elif skip_text or self.settings.mode == ModeChoices.OFF:
|
|
||||||
ocrmypdf_args["skip_text"] = True
|
|
||||||
elif self.settings.mode == ModeChoices.AUTO:
|
|
||||||
pass # no extra flag: normal OCR (text not found case)
|
|
||||||
else: # pragma: no cover
|
else: # pragma: no cover
|
||||||
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
|
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
|
||||||
|
|
||||||
@@ -380,74 +400,6 @@ class RasterisedDocumentParser:
|
|||||||
|
|
||||||
return ocrmypdf_args
|
return ocrmypdf_args
|
||||||
|
|
||||||
def _convert_image_to_pdfa(self, document_path: Path) -> Path:
|
|
||||||
"""Convert an image to a PDF/A-2b file without invoking the OCR engine.
|
|
||||||
|
|
||||||
Uses img2pdf for the initial image->PDF wrapping, then pikepdf to stamp
|
|
||||||
PDF/A-2b conformance metadata.
|
|
||||||
|
|
||||||
No Tesseract and no Ghostscript are invoked.
|
|
||||||
"""
|
|
||||||
import img2pdf
|
|
||||||
import pikepdf
|
|
||||||
|
|
||||||
plain_pdf_path = Path(self.tempdir) / "image_plain.pdf"
|
|
||||||
try:
|
|
||||||
layout_fun = None
|
|
||||||
if self.settings.image_dpi is not None:
|
|
||||||
layout_fun = img2pdf.get_fixed_dpi_layout_fun(
|
|
||||||
(self.settings.image_dpi, self.settings.image_dpi),
|
|
||||||
)
|
|
||||||
plain_pdf_path.write_bytes(
|
|
||||||
img2pdf.convert(str(document_path), layout_fun=layout_fun),
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
raise ParseError(
|
|
||||||
f"img2pdf conversion failed for {document_path}: {e!s}",
|
|
||||||
) from e
|
|
||||||
|
|
||||||
pdfa_path = Path(self.tempdir) / "archive.pdf"
|
|
||||||
try:
|
|
||||||
with pikepdf.open(plain_pdf_path) as pdf:
|
|
||||||
cs = pdf.make_stream(_SRGB_ICC_DATA)
|
|
||||||
cs["/N"] = 3
|
|
||||||
output_intent = pikepdf.Dictionary(
|
|
||||||
Type=pikepdf.Name("/OutputIntent"),
|
|
||||||
S=pikepdf.Name("/GTS_PDFA1"),
|
|
||||||
OutputConditionIdentifier=pikepdf.String("sRGB"),
|
|
||||||
DestOutputProfile=cs,
|
|
||||||
)
|
|
||||||
pdf.Root["/OutputIntents"] = pdf.make_indirect(
|
|
||||||
pikepdf.Array([output_intent]),
|
|
||||||
)
|
|
||||||
meta = pdf.open_metadata(set_pikepdf_as_editor=False)
|
|
||||||
meta["pdfaid:part"] = "2"
|
|
||||||
meta["pdfaid:conformance"] = "B"
|
|
||||||
pdf.save(pdfa_path)
|
|
||||||
except Exception as e:
|
|
||||||
self.log.warning(
|
|
||||||
f"PDF/A metadata stamping failed ({e!s}); falling back to plain PDF.",
|
|
||||||
)
|
|
||||||
pdfa_path.write_bytes(plain_pdf_path.read_bytes())
|
|
||||||
|
|
||||||
return pdfa_path
|
|
||||||
|
|
||||||
def _handle_subprocess_output_error(self, e: Exception) -> NoReturn:
|
|
||||||
"""Log context for Ghostscript failures and raise ParseError.
|
|
||||||
|
|
||||||
Called from the SubprocessOutputError handlers in parse() to avoid
|
|
||||||
duplicating the Ghostscript hint and re-raise logic.
|
|
||||||
"""
|
|
||||||
if "Ghostscript PDF/A rendering" in str(e):
|
|
||||||
self.log.warning(
|
|
||||||
"Ghostscript PDF/A rendering failed, consider setting "
|
|
||||||
"PAPERLESS_OCR_USER_ARGS: "
|
|
||||||
"'{\"continue_on_soft_render_error\": true}'",
|
|
||||||
)
|
|
||||||
raise ParseError(
|
|
||||||
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
|
||||||
) from e
|
|
||||||
|
|
||||||
def parse(
|
def parse(
|
||||||
self,
|
self,
|
||||||
document_path: Path,
|
document_path: Path,
|
||||||
@@ -457,94 +409,57 @@ class RasterisedDocumentParser:
|
|||||||
) -> None:
|
) -> None:
|
||||||
# This forces tesseract to use one core per page.
|
# This forces tesseract to use one core per page.
|
||||||
os.environ["OMP_THREAD_LIMIT"] = "1"
|
os.environ["OMP_THREAD_LIMIT"] = "1"
|
||||||
|
VALID_TEXT_LENGTH = 50
|
||||||
|
|
||||||
|
if mime_type == "application/pdf":
|
||||||
|
text_original = self.extract_text(None, document_path)
|
||||||
|
original_has_text = (
|
||||||
|
text_original is not None and len(text_original) > VALID_TEXT_LENGTH
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
text_original = None
|
||||||
|
original_has_text = False
|
||||||
|
|
||||||
|
# If the original has text, and the user doesn't want an archive,
|
||||||
|
# we're done here
|
||||||
|
skip_archive_for_text = (
|
||||||
|
self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
|
||||||
|
or self.settings.skip_archive_file
|
||||||
|
in {
|
||||||
|
ArchiveFileChoices.WITH_TEXT,
|
||||||
|
ArchiveFileChoices.ALWAYS,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
if skip_archive_for_text and original_has_text:
|
||||||
|
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
|
||||||
|
self.text = text_original
|
||||||
|
return
|
||||||
|
|
||||||
|
# Either no text was in the original or there should be an archive
|
||||||
|
# file created, so OCR the file and create an archive with any
|
||||||
|
# text located via OCR
|
||||||
|
|
||||||
import ocrmypdf
|
import ocrmypdf
|
||||||
from ocrmypdf import EncryptedPdfError
|
from ocrmypdf import EncryptedPdfError
|
||||||
from ocrmypdf import InputFileError
|
from ocrmypdf import InputFileError
|
||||||
from ocrmypdf import SubprocessOutputError
|
from ocrmypdf import SubprocessOutputError
|
||||||
from ocrmypdf.exceptions import DigitalSignatureError
|
from ocrmypdf.exceptions import DigitalSignatureError
|
||||||
from ocrmypdf.exceptions import PriorOcrFoundError
|
|
||||||
|
|
||||||
if mime_type == "application/pdf":
|
|
||||||
text_original = self.extract_text(None, document_path)
|
|
||||||
original_has_text = is_tagged_pdf(document_path, log=self.log) or (
|
|
||||||
text_original is not None and len(text_original) > PDF_TEXT_MIN_LENGTH
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
text_original = None
|
|
||||||
original_has_text = False
|
|
||||||
|
|
||||||
# --- OCR_MODE=off: never invoke OCR engine ---
|
|
||||||
if self.settings.mode == ModeChoices.OFF:
|
|
||||||
if not produce_archive:
|
|
||||||
self.text = text_original or ""
|
|
||||||
return
|
|
||||||
if self.is_image(mime_type):
|
|
||||||
try:
|
|
||||||
self.archive_path = self._convert_image_to_pdfa(
|
|
||||||
document_path,
|
|
||||||
)
|
|
||||||
self.text = ""
|
|
||||||
except Exception as e:
|
|
||||||
raise ParseError(
|
|
||||||
f"Image to PDF/A conversion failed: {e!s}",
|
|
||||||
) from e
|
|
||||||
return
|
|
||||||
# PDFs in off mode: PDF/A conversion only via skip_text
|
|
||||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
archive_path = Path(self.tempdir) / "archive.pdf"
|
||||||
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
||||||
args = self.construct_ocrmypdf_parameters(
|
|
||||||
document_path,
|
|
||||||
mime_type,
|
|
||||||
archive_path,
|
|
||||||
sidecar_file,
|
|
||||||
skip_text=True,
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
self.log.debug(
|
|
||||||
f"Calling OCRmyPDF (off mode, PDF/A conversion only): {args}",
|
|
||||||
)
|
|
||||||
ocrmypdf.ocr(**args)
|
|
||||||
self.archive_path = archive_path
|
|
||||||
self.text = self.extract_text(None, archive_path) or text_original or ""
|
|
||||||
except SubprocessOutputError as e:
|
|
||||||
self._handle_subprocess_output_error(e)
|
|
||||||
except Exception as e:
|
|
||||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
|
||||||
return
|
|
||||||
|
|
||||||
# --- OCR_MODE=auto: skip ocrmypdf entirely if text exists and no archive needed ---
|
|
||||||
if (
|
|
||||||
self.settings.mode == ModeChoices.AUTO
|
|
||||||
and original_has_text
|
|
||||||
and not produce_archive
|
|
||||||
):
|
|
||||||
self.log.debug(
|
|
||||||
"Document has text and no archive requested; skipping OCRmyPDF entirely.",
|
|
||||||
)
|
|
||||||
self.text = text_original
|
|
||||||
return
|
|
||||||
|
|
||||||
# --- All other paths: run ocrmypdf ---
|
|
||||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
|
||||||
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
|
||||||
|
|
||||||
# auto mode with existing text: PDF/A conversion only (no OCR).
|
|
||||||
skip_text = self.settings.mode == ModeChoices.AUTO and original_has_text
|
|
||||||
|
|
||||||
args = self.construct_ocrmypdf_parameters(
|
args = self.construct_ocrmypdf_parameters(
|
||||||
document_path,
|
document_path,
|
||||||
mime_type,
|
mime_type,
|
||||||
archive_path,
|
archive_path,
|
||||||
sidecar_file,
|
sidecar_file,
|
||||||
skip_text=skip_text,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.log.debug(f"Calling OCRmyPDF with args: {args}")
|
self.log.debug(f"Calling OCRmyPDF with args: {args}")
|
||||||
ocrmypdf.ocr(**args)
|
ocrmypdf.ocr(**args)
|
||||||
|
|
||||||
if produce_archive:
|
if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
|
||||||
self.archive_path = archive_path
|
self.archive_path = archive_path
|
||||||
|
|
||||||
self.text = self.extract_text(sidecar_file, archive_path)
|
self.text = self.extract_text(sidecar_file, archive_path)
|
||||||
@@ -559,8 +474,16 @@ class RasterisedDocumentParser:
|
|||||||
if original_has_text:
|
if original_has_text:
|
||||||
self.text = text_original
|
self.text = text_original
|
||||||
except SubprocessOutputError as e:
|
except SubprocessOutputError as e:
|
||||||
self._handle_subprocess_output_error(e)
|
if "Ghostscript PDF/A rendering" in str(e):
|
||||||
except (NoTextFoundException, InputFileError, PriorOcrFoundError) as e:
|
self.log.warning(
|
||||||
|
"Ghostscript PDF/A rendering failed, consider setting "
|
||||||
|
"PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
|
||||||
|
)
|
||||||
|
|
||||||
|
raise ParseError(
|
||||||
|
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
||||||
|
) from e
|
||||||
|
except (NoTextFoundException, InputFileError) as e:
|
||||||
self.log.warning(
|
self.log.warning(
|
||||||
f"Encountered an error while running OCR: {e!s}. "
|
f"Encountered an error while running OCR: {e!s}. "
|
||||||
f"Attempting force OCR to get the text.",
|
f"Attempting force OCR to get the text.",
|
||||||
@@ -569,6 +492,8 @@ class RasterisedDocumentParser:
|
|||||||
archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
|
archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
|
||||||
sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
|
sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
|
||||||
|
|
||||||
|
# Attempt to run OCR with safe settings.
|
||||||
|
|
||||||
args = self.construct_ocrmypdf_parameters(
|
args = self.construct_ocrmypdf_parameters(
|
||||||
document_path,
|
document_path,
|
||||||
mime_type,
|
mime_type,
|
||||||
@@ -580,18 +505,25 @@ class RasterisedDocumentParser:
|
|||||||
try:
|
try:
|
||||||
self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
|
self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
|
||||||
ocrmypdf.ocr(**args)
|
ocrmypdf.ocr(**args)
|
||||||
|
|
||||||
|
# Don't return the archived file here, since this file
|
||||||
|
# is bigger and blurry due to --force-ocr.
|
||||||
|
|
||||||
self.text = self.extract_text(
|
self.text = self.extract_text(
|
||||||
sidecar_file_fallback,
|
sidecar_file_fallback,
|
||||||
archive_path_fallback,
|
archive_path_fallback,
|
||||||
)
|
)
|
||||||
if produce_archive:
|
|
||||||
self.archive_path = archive_path_fallback
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
# If this fails, we have a serious issue at hand.
|
||||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
# Anything else is probably serious.
|
||||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||||
|
|
||||||
|
# As a last resort, if we still don't have any text for any reason,
|
||||||
|
# try to extract the text from the original document.
|
||||||
if not self.text:
|
if not self.text:
|
||||||
if original_has_text:
|
if original_has_text:
|
||||||
self.text = text_original
|
self.text = text_original
|
||||||
|
|||||||
@@ -10,105 +10,15 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import tempfile
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
from typing import Final
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from paperless.parsers import MetadataEntry
|
from paperless.parsers import MetadataEntry
|
||||||
|
|
||||||
logger = logging.getLogger("paperless.parsers.utils")
|
logger = logging.getLogger("paperless.parsers.utils")
|
||||||
|
|
||||||
# Minimum character count for a PDF to be considered "born-digital" (has real text).
|
|
||||||
# Used by both the consumer (archive decision) and the tesseract parser (skip-OCR decision).
|
|
||||||
PDF_TEXT_MIN_LENGTH: Final[int] = 50
|
|
||||||
|
|
||||||
|
|
||||||
def is_tagged_pdf(
|
|
||||||
path: Path,
|
|
||||||
log: logging.Logger | None = None,
|
|
||||||
) -> bool:
|
|
||||||
"""Return True if the PDF declares itself as tagged (born-digital indicator).
|
|
||||||
|
|
||||||
Tagged PDFs (e.g. exported from Word or LibreOffice) have ``/MarkInfo``
|
|
||||||
with ``/Marked true`` in the document root. This is a reliable signal
|
|
||||||
that the document has a logical structure and embedded text — running OCR
|
|
||||||
on it is unnecessary and archive generation can be skipped.
|
|
||||||
|
|
||||||
https://github.com/ocrmypdf/OCRmyPDF/blob/4e974ebd465a5921b2e79004f098f5d203010282/src/ocrmypdf/pdfinfo/info.py#L449
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
path:
|
|
||||||
Absolute path to the PDF file.
|
|
||||||
log:
|
|
||||||
Logger for warnings. Falls back to the module-level logger when omitted.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
bool
|
|
||||||
``True`` when the PDF is tagged, ``False`` otherwise or on any error.
|
|
||||||
"""
|
|
||||||
import pikepdf
|
|
||||||
|
|
||||||
_log = log or logger
|
|
||||||
try:
|
|
||||||
with pikepdf.open(path) as pdf:
|
|
||||||
mark_info = pdf.Root.get("/MarkInfo")
|
|
||||||
if mark_info is None:
|
|
||||||
return False
|
|
||||||
return bool(mark_info.get("/Marked", False))
|
|
||||||
except Exception:
|
|
||||||
_log.warning("Could not check PDF tag status for %s", path, exc_info=True)
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def extract_pdf_text(
|
|
||||||
path: Path,
|
|
||||||
log: logging.Logger | None = None,
|
|
||||||
) -> str | None:
|
|
||||||
"""Run pdftotext on *path* and return the extracted text, or None on failure.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
path:
|
|
||||||
Absolute path to the PDF file.
|
|
||||||
log:
|
|
||||||
Logger for warnings. Falls back to the module-level logger when omitted.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
str | None
|
|
||||||
Extracted text, or ``None`` if pdftotext fails or the file is not a PDF.
|
|
||||||
"""
|
|
||||||
from documents.utils import run_subprocess
|
|
||||||
|
|
||||||
_log = log or logger
|
|
||||||
try:
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
out_path = Path(tmpdir) / "text.txt"
|
|
||||||
run_subprocess(
|
|
||||||
[
|
|
||||||
"pdftotext",
|
|
||||||
"-q",
|
|
||||||
"-layout",
|
|
||||||
"-enc",
|
|
||||||
"UTF-8",
|
|
||||||
str(path),
|
|
||||||
str(out_path),
|
|
||||||
],
|
|
||||||
logger=_log,
|
|
||||||
)
|
|
||||||
text = read_file_handle_unicode_errors(out_path, log=_log)
|
|
||||||
return text or None
|
|
||||||
except Exception:
|
|
||||||
_log.warning(
|
|
||||||
"Error while getting text from PDF document with pdftotext",
|
|
||||||
exc_info=True,
|
|
||||||
)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def read_file_handle_unicode_errors(
|
def read_file_handle_unicode_errors(
|
||||||
filepath: Path,
|
filepath: Path,
|
||||||
|
|||||||
@@ -21,7 +21,6 @@ from paperless.settings.custom import parse_hosting_settings
|
|||||||
from paperless.settings.custom import parse_ignore_dates
|
from paperless.settings.custom import parse_ignore_dates
|
||||||
from paperless.settings.custom import parse_redis_url
|
from paperless.settings.custom import parse_redis_url
|
||||||
from paperless.settings.parsers import get_bool_from_env
|
from paperless.settings.parsers import get_bool_from_env
|
||||||
from paperless.settings.parsers import get_choice_from_env
|
|
||||||
from paperless.settings.parsers import get_float_from_env
|
from paperless.settings.parsers import get_float_from_env
|
||||||
from paperless.settings.parsers import get_int_from_env
|
from paperless.settings.parsers import get_int_from_env
|
||||||
from paperless.settings.parsers import get_list_from_env
|
from paperless.settings.parsers import get_list_from_env
|
||||||
@@ -875,17 +874,10 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
|||||||
# OCRmyPDF --output-type options are available.
|
# OCRmyPDF --output-type options are available.
|
||||||
OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
|
OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
|
||||||
|
|
||||||
OCR_MODE = get_choice_from_env(
|
# skip. redo, force
|
||||||
"PAPERLESS_OCR_MODE",
|
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
|
||||||
{"auto", "force", "redo", "off"},
|
|
||||||
default="auto",
|
|
||||||
)
|
|
||||||
|
|
||||||
ARCHIVE_FILE_GENERATION = get_choice_from_env(
|
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
|
||||||
"PAPERLESS_ARCHIVE_FILE_GENERATION",
|
|
||||||
{"auto", "always", "never"},
|
|
||||||
default="auto",
|
|
||||||
)
|
|
||||||
|
|
||||||
OCR_IMAGE_DPI = get_int_from_env("PAPERLESS_OCR_IMAGE_DPI")
|
OCR_IMAGE_DPI = get_int_from_env("PAPERLESS_OCR_IMAGE_DPI")
|
||||||
|
|
||||||
|
|||||||
@@ -708,7 +708,7 @@ def null_app_config(mocker: MockerFixture) -> MagicMock:
|
|||||||
pages=None,
|
pages=None,
|
||||||
language=None,
|
language=None,
|
||||||
mode=None,
|
mode=None,
|
||||||
archive_file_generation=None,
|
skip_archive_file=None,
|
||||||
image_dpi=None,
|
image_dpi=None,
|
||||||
unpaper_clean=None,
|
unpaper_clean=None,
|
||||||
deskew=None,
|
deskew=None,
|
||||||
|
|||||||
@@ -1,436 +0,0 @@
|
|||||||
"""
|
|
||||||
Focused tests for RasterisedDocumentParser.parse() mode behaviour.
|
|
||||||
|
|
||||||
These tests mock ``ocrmypdf.ocr`` so they run without a real Tesseract/OCRmyPDF
|
|
||||||
installation and execute quickly. The intent is to verify the *control flow*
|
|
||||||
introduced by the ``produce_archive`` flag and the ``OCR_MODE=auto/off`` logic,
|
|
||||||
not to test OCRmyPDF itself.
|
|
||||||
|
|
||||||
Fixtures are pulled from conftest.py in this package.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import TYPE_CHECKING
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from pytest_mock import MockerFixture
|
|
||||||
|
|
||||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Helpers
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
_LONG_TEXT = "This is a test document with enough text. " * 5 # >50 chars
|
|
||||||
_SHORT_TEXT = "Hi." # <50 chars
|
|
||||||
|
|
||||||
|
|
||||||
def _make_extract_text(text: str | None):
|
|
||||||
"""Return a side_effect function for ``extract_text`` that returns *text*."""
|
|
||||||
|
|
||||||
def _extract(sidecar_file, pdf_file):
|
|
||||||
return text
|
|
||||||
|
|
||||||
return _extract
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# AUTO mode — PDF with sufficient text layer
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestAutoModeWithText:
|
|
||||||
"""AUTO mode, original PDF has detectable text (>50 chars)."""
|
|
||||||
|
|
||||||
def test_auto_text_no_archive_skips_ocrmypdf(
|
|
||||||
self,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
|
||||||
simple_digital_pdf_file: Path,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- AUTO mode, produce_archive=False
|
|
||||||
- PDF with text > VALID_TEXT_LENGTH
|
|
||||||
WHEN:
|
|
||||||
- parse() is called
|
|
||||||
THEN:
|
|
||||||
- ocrmypdf.ocr is NOT called (early return path)
|
|
||||||
- archive_path remains None
|
|
||||||
- text is set from the original
|
|
||||||
"""
|
|
||||||
# Patch extract_text to return long text (simulating detectable text layer)
|
|
||||||
mocker.patch.object(
|
|
||||||
tesseract_parser,
|
|
||||||
"extract_text",
|
|
||||||
return_value=_LONG_TEXT,
|
|
||||||
)
|
|
||||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
|
||||||
|
|
||||||
tesseract_parser.settings.mode = "auto"
|
|
||||||
tesseract_parser.parse(
|
|
||||||
simple_digital_pdf_file,
|
|
||||||
"application/pdf",
|
|
||||||
produce_archive=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
mock_ocr.assert_not_called()
|
|
||||||
assert tesseract_parser.archive_path is None
|
|
||||||
assert tesseract_parser.get_text() == _LONG_TEXT
|
|
||||||
|
|
||||||
def test_auto_text_with_archive_calls_ocrmypdf_skip_text(
|
|
||||||
self,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
|
||||||
simple_digital_pdf_file: Path,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- AUTO mode, produce_archive=True
|
|
||||||
- PDF with text > VALID_TEXT_LENGTH
|
|
||||||
WHEN:
|
|
||||||
- parse() is called
|
|
||||||
THEN:
|
|
||||||
- ocrmypdf.ocr IS called with skip_text=True
|
|
||||||
- archive_path is set
|
|
||||||
"""
|
|
||||||
mocker.patch.object(
|
|
||||||
tesseract_parser,
|
|
||||||
"extract_text",
|
|
||||||
return_value=_LONG_TEXT,
|
|
||||||
)
|
|
||||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
|
||||||
|
|
||||||
tesseract_parser.settings.mode = "auto"
|
|
||||||
tesseract_parser.parse(
|
|
||||||
simple_digital_pdf_file,
|
|
||||||
"application/pdf",
|
|
||||||
produce_archive=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
mock_ocr.assert_called_once()
|
|
||||||
call_kwargs = mock_ocr.call_args.kwargs
|
|
||||||
assert call_kwargs.get("skip_text") is True
|
|
||||||
assert "force_ocr" not in call_kwargs
|
|
||||||
assert "redo_ocr" not in call_kwargs
|
|
||||||
assert tesseract_parser.archive_path is not None
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# AUTO mode — PDF without text layer (or too short)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestAutoModeNoText:
|
|
||||||
"""AUTO mode, original PDF has no detectable text (<= 50 chars)."""
|
|
||||||
|
|
||||||
def test_auto_no_text_with_archive_calls_ocrmypdf_no_extra_flag(
|
|
||||||
self,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
|
||||||
multi_page_images_pdf_file: Path,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- AUTO mode, produce_archive=True
|
|
||||||
- PDF with no text (or text <= VALID_TEXT_LENGTH)
|
|
||||||
WHEN:
|
|
||||||
- parse() is called
|
|
||||||
THEN:
|
|
||||||
- ocrmypdf.ocr IS called WITHOUT skip_text/force_ocr/redo_ocr
|
|
||||||
- archive_path is set (since produce_archive=True)
|
|
||||||
"""
|
|
||||||
# Return "no text" for the original; return real text for archive
|
|
||||||
extract_call_count = 0
|
|
||||||
|
|
||||||
def _extract_side(sidecar_file, pdf_file):
|
|
||||||
nonlocal extract_call_count
|
|
||||||
extract_call_count += 1
|
|
||||||
if extract_call_count == 1:
|
|
||||||
return None # original has no text
|
|
||||||
return _LONG_TEXT # text from archive after OCR
|
|
||||||
|
|
||||||
mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side)
|
|
||||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
|
||||||
|
|
||||||
tesseract_parser.settings.mode = "auto"
|
|
||||||
tesseract_parser.parse(
|
|
||||||
multi_page_images_pdf_file,
|
|
||||||
"application/pdf",
|
|
||||||
produce_archive=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
mock_ocr.assert_called_once()
|
|
||||||
call_kwargs = mock_ocr.call_args.kwargs
|
|
||||||
assert "skip_text" not in call_kwargs
|
|
||||||
assert "force_ocr" not in call_kwargs
|
|
||||||
assert "redo_ocr" not in call_kwargs
|
|
||||||
assert tesseract_parser.archive_path is not None
|
|
||||||
|
|
||||||
def test_auto_no_text_no_archive_calls_ocrmypdf(
|
|
||||||
self,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
|
||||||
multi_page_images_pdf_file: Path,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- AUTO mode, produce_archive=False
|
|
||||||
- PDF with no text
|
|
||||||
WHEN:
|
|
||||||
- parse() is called
|
|
||||||
THEN:
|
|
||||||
- ocrmypdf.ocr IS called (no early return since no text detected)
|
|
||||||
- archive_path is NOT set (produce_archive=False)
|
|
||||||
"""
|
|
||||||
extract_call_count = 0
|
|
||||||
|
|
||||||
def _extract_side(sidecar_file, pdf_file):
|
|
||||||
nonlocal extract_call_count
|
|
||||||
extract_call_count += 1
|
|
||||||
if extract_call_count == 1:
|
|
||||||
return None
|
|
||||||
return _LONG_TEXT
|
|
||||||
|
|
||||||
mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side)
|
|
||||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
|
||||||
|
|
||||||
tesseract_parser.settings.mode = "auto"
|
|
||||||
tesseract_parser.parse(
|
|
||||||
multi_page_images_pdf_file,
|
|
||||||
"application/pdf",
|
|
||||||
produce_archive=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
mock_ocr.assert_called_once()
|
|
||||||
assert tesseract_parser.archive_path is None
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# OFF mode — PDF
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestOffModePdf:
|
|
||||||
"""OCR_MODE=off, document is a PDF."""
|
|
||||||
|
|
||||||
def test_off_no_archive_returns_pdftotext(
|
|
||||||
self,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
|
||||||
simple_digital_pdf_file: Path,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- OFF mode, produce_archive=False
|
|
||||||
- PDF with text
|
|
||||||
WHEN:
|
|
||||||
- parse() is called
|
|
||||||
THEN:
|
|
||||||
- ocrmypdf.ocr is NOT called
|
|
||||||
- archive_path is None
|
|
||||||
- text comes from pdftotext (extract_text)
|
|
||||||
"""
|
|
||||||
mocker.patch.object(
|
|
||||||
tesseract_parser,
|
|
||||||
"extract_text",
|
|
||||||
return_value=_LONG_TEXT,
|
|
||||||
)
|
|
||||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
|
||||||
|
|
||||||
tesseract_parser.settings.mode = "off"
|
|
||||||
tesseract_parser.parse(
|
|
||||||
simple_digital_pdf_file,
|
|
||||||
"application/pdf",
|
|
||||||
produce_archive=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
mock_ocr.assert_not_called()
|
|
||||||
assert tesseract_parser.archive_path is None
|
|
||||||
assert tesseract_parser.get_text() == _LONG_TEXT
|
|
||||||
|
|
||||||
def test_off_with_archive_calls_ocrmypdf_skip_text(
|
|
||||||
self,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
|
||||||
simple_digital_pdf_file: Path,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- OFF mode, produce_archive=True
|
|
||||||
- PDF document
|
|
||||||
WHEN:
|
|
||||||
- parse() is called
|
|
||||||
THEN:
|
|
||||||
- ocrmypdf.ocr IS called with skip_text=True (PDF/A conversion only)
|
|
||||||
- archive_path is set
|
|
||||||
"""
|
|
||||||
mocker.patch.object(
|
|
||||||
tesseract_parser,
|
|
||||||
"extract_text",
|
|
||||||
return_value=_LONG_TEXT,
|
|
||||||
)
|
|
||||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
|
||||||
|
|
||||||
tesseract_parser.settings.mode = "off"
|
|
||||||
tesseract_parser.parse(
|
|
||||||
simple_digital_pdf_file,
|
|
||||||
"application/pdf",
|
|
||||||
produce_archive=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
mock_ocr.assert_called_once()
|
|
||||||
call_kwargs = mock_ocr.call_args.kwargs
|
|
||||||
assert call_kwargs.get("skip_text") is True
|
|
||||||
assert "force_ocr" not in call_kwargs
|
|
||||||
assert "redo_ocr" not in call_kwargs
|
|
||||||
assert tesseract_parser.archive_path is not None
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# OFF mode — image
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestOffModeImage:
|
|
||||||
"""OCR_MODE=off, document is an image (PNG)."""
|
|
||||||
|
|
||||||
def test_off_image_no_archive_no_ocrmypdf(
|
|
||||||
self,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
|
||||||
simple_png_file: Path,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- OFF mode, produce_archive=False
|
|
||||||
- Image document (PNG)
|
|
||||||
WHEN:
|
|
||||||
- parse() is called
|
|
||||||
THEN:
|
|
||||||
- ocrmypdf.ocr is NOT called
|
|
||||||
- archive_path is None
|
|
||||||
- text is empty string (images have no text layer)
|
|
||||||
"""
|
|
||||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
|
||||||
|
|
||||||
tesseract_parser.settings.mode = "off"
|
|
||||||
tesseract_parser.parse(simple_png_file, "image/png", produce_archive=False)
|
|
||||||
|
|
||||||
mock_ocr.assert_not_called()
|
|
||||||
assert tesseract_parser.archive_path is None
|
|
||||||
assert tesseract_parser.get_text() == ""
|
|
||||||
|
|
||||||
def test_off_image_with_archive_uses_img2pdf_path(
|
|
||||||
self,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
|
||||||
simple_png_file: Path,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- OFF mode, produce_archive=True
|
|
||||||
- Image document (PNG)
|
|
||||||
WHEN:
|
|
||||||
- parse() is called
|
|
||||||
THEN:
|
|
||||||
- _convert_image_to_pdfa() is called instead of ocrmypdf.ocr
|
|
||||||
- archive_path is set to the returned path
|
|
||||||
- text is empty string
|
|
||||||
"""
|
|
||||||
fake_archive = Path("/tmp/fake-archive.pdf")
|
|
||||||
mock_convert = mocker.patch.object(
|
|
||||||
tesseract_parser,
|
|
||||||
"_convert_image_to_pdfa",
|
|
||||||
return_value=fake_archive,
|
|
||||||
)
|
|
||||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
|
||||||
|
|
||||||
tesseract_parser.settings.mode = "off"
|
|
||||||
tesseract_parser.parse(simple_png_file, "image/png", produce_archive=True)
|
|
||||||
|
|
||||||
mock_convert.assert_called_once_with(simple_png_file)
|
|
||||||
mock_ocr.assert_not_called()
|
|
||||||
assert tesseract_parser.archive_path == fake_archive
|
|
||||||
assert tesseract_parser.get_text() == ""
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# produce_archive=False never sets archive_path for FORCE / REDO / AUTO modes
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestProduceArchiveFalse:
|
|
||||||
"""Verify produce_archive=False never results in an archive regardless of mode."""
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("mode", ["force", "redo"])
|
|
||||||
def test_produce_archive_false_force_redo_modes(
|
|
||||||
self,
|
|
||||||
mode: str,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
|
||||||
multi_page_images_pdf_file: Path,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- FORCE or REDO mode, produce_archive=False
|
|
||||||
- Any PDF
|
|
||||||
WHEN:
|
|
||||||
- parse() is called (ocrmypdf mocked to succeed)
|
|
||||||
THEN:
|
|
||||||
- archive_path is NOT set even though ocrmypdf ran
|
|
||||||
"""
|
|
||||||
mocker.patch.object(
|
|
||||||
tesseract_parser,
|
|
||||||
"extract_text",
|
|
||||||
return_value=_LONG_TEXT,
|
|
||||||
)
|
|
||||||
mocker.patch("ocrmypdf.ocr")
|
|
||||||
|
|
||||||
tesseract_parser.settings.mode = mode
|
|
||||||
tesseract_parser.parse(
|
|
||||||
multi_page_images_pdf_file,
|
|
||||||
"application/pdf",
|
|
||||||
produce_archive=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert tesseract_parser.archive_path is None
|
|
||||||
assert tesseract_parser.get_text() is not None
|
|
||||||
|
|
||||||
def test_produce_archive_false_auto_with_text(
|
|
||||||
self,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
|
||||||
simple_digital_pdf_file: Path,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- AUTO mode, produce_archive=False
|
|
||||||
- PDF with text > VALID_TEXT_LENGTH
|
|
||||||
WHEN:
|
|
||||||
- parse() is called
|
|
||||||
THEN:
|
|
||||||
- ocrmypdf is skipped entirely (early return)
|
|
||||||
- archive_path is None
|
|
||||||
"""
|
|
||||||
mocker.patch.object(
|
|
||||||
tesseract_parser,
|
|
||||||
"extract_text",
|
|
||||||
return_value=_LONG_TEXT,
|
|
||||||
)
|
|
||||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
|
||||||
|
|
||||||
tesseract_parser.settings.mode = "auto"
|
|
||||||
tesseract_parser.parse(
|
|
||||||
simple_digital_pdf_file,
|
|
||||||
"application/pdf",
|
|
||||||
produce_archive=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
mock_ocr.assert_not_called()
|
|
||||||
assert tesseract_parser.archive_path is None
|
|
||||||
@@ -89,35 +89,15 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
|||||||
WHEN:
|
WHEN:
|
||||||
- OCR parameters are constructed
|
- OCR parameters are constructed
|
||||||
THEN:
|
THEN:
|
||||||
- Configuration from database is utilized (AUTO mode with skip_text=True
|
- Configuration from database is utilized
|
||||||
triggers skip_text; AUTO mode alone does not add any extra flag)
|
|
||||||
"""
|
"""
|
||||||
# AUTO mode with skip_text=True explicitly passed: skip_text is set
|
|
||||||
with override_settings(OCR_MODE="redo"):
|
with override_settings(OCR_MODE="redo"):
|
||||||
instance = ApplicationConfiguration.objects.all().first()
|
instance = ApplicationConfiguration.objects.all().first()
|
||||||
instance.mode = ModeChoices.AUTO
|
instance.mode = ModeChoices.SKIP
|
||||||
instance.save()
|
|
||||||
|
|
||||||
params = RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
|
|
||||||
input_file="input.pdf",
|
|
||||||
output_file="output.pdf",
|
|
||||||
sidecar_file="sidecar.txt",
|
|
||||||
mime_type="application/pdf",
|
|
||||||
safe_fallback=False,
|
|
||||||
skip_text=True,
|
|
||||||
)
|
|
||||||
self.assertTrue(params["skip_text"])
|
|
||||||
self.assertNotIn("redo_ocr", params)
|
|
||||||
self.assertNotIn("force_ocr", params)
|
|
||||||
|
|
||||||
# AUTO mode alone (no skip_text): no extra OCR flag is set
|
|
||||||
with override_settings(OCR_MODE="redo"):
|
|
||||||
instance = ApplicationConfiguration.objects.all().first()
|
|
||||||
instance.mode = ModeChoices.AUTO
|
|
||||||
instance.save()
|
instance.save()
|
||||||
|
|
||||||
params = self.get_params()
|
params = self.get_params()
|
||||||
self.assertNotIn("skip_text", params)
|
self.assertTrue(params["skip_text"])
|
||||||
self.assertNotIn("redo_ocr", params)
|
self.assertNotIn("redo_ocr", params)
|
||||||
self.assertNotIn("force_ocr", params)
|
self.assertNotIn("force_ocr", params)
|
||||||
|
|
||||||
|
|||||||
@@ -370,26 +370,15 @@ class TestParsePdf:
|
|||||||
tesseract_parser: RasterisedDocumentParser,
|
tesseract_parser: RasterisedDocumentParser,
|
||||||
tesseract_samples_dir: Path,
|
tesseract_samples_dir: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- Multi-page digital PDF with sufficient text layer
|
|
||||||
- Default settings (mode=auto, produce_archive=True)
|
|
||||||
WHEN:
|
|
||||||
- Document is parsed
|
|
||||||
THEN:
|
|
||||||
- Archive is created (AUTO mode + text present + produce_archive=True
|
|
||||||
→ PDF/A conversion via skip_text)
|
|
||||||
- Text is extracted
|
|
||||||
"""
|
|
||||||
tesseract_parser.parse(
|
tesseract_parser.parse(
|
||||||
tesseract_samples_dir / "multi-page-digital.pdf",
|
tesseract_samples_dir / "simple-digital.pdf",
|
||||||
"application/pdf",
|
"application/pdf",
|
||||||
)
|
)
|
||||||
assert tesseract_parser.archive_path is not None
|
assert tesseract_parser.archive_path is not None
|
||||||
assert tesseract_parser.archive_path.is_file()
|
assert tesseract_parser.archive_path.is_file()
|
||||||
assert_ordered_substrings(
|
assert_ordered_substrings(
|
||||||
tesseract_parser.get_text().lower(),
|
tesseract_parser.get_text(),
|
||||||
["page 1", "page 2", "page 3"],
|
["This is a test document."],
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_with_form_default(
|
def test_with_form_default(
|
||||||
@@ -408,7 +397,7 @@ class TestParsePdf:
|
|||||||
["Please enter your name in here:", "This is a PDF document with a form."],
|
["Please enter your name in here:", "This is a PDF document with a form."],
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_with_form_redo_no_archive_when_not_requested(
|
def test_with_form_redo_produces_no_archive(
|
||||||
self,
|
self,
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
tesseract_parser: RasterisedDocumentParser,
|
||||||
tesseract_samples_dir: Path,
|
tesseract_samples_dir: Path,
|
||||||
@@ -417,7 +406,6 @@ class TestParsePdf:
|
|||||||
tesseract_parser.parse(
|
tesseract_parser.parse(
|
||||||
tesseract_samples_dir / "with-form.pdf",
|
tesseract_samples_dir / "with-form.pdf",
|
||||||
"application/pdf",
|
"application/pdf",
|
||||||
produce_archive=False,
|
|
||||||
)
|
)
|
||||||
assert tesseract_parser.archive_path is None
|
assert tesseract_parser.archive_path is None
|
||||||
assert_ordered_substrings(
|
assert_ordered_substrings(
|
||||||
@@ -445,7 +433,7 @@ class TestParsePdf:
|
|||||||
tesseract_parser: RasterisedDocumentParser,
|
tesseract_parser: RasterisedDocumentParser,
|
||||||
tesseract_samples_dir: Path,
|
tesseract_samples_dir: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
tesseract_parser.settings.mode = "auto"
|
tesseract_parser.settings.mode = "skip"
|
||||||
tesseract_parser.parse(tesseract_samples_dir / "signed.pdf", "application/pdf")
|
tesseract_parser.parse(tesseract_samples_dir / "signed.pdf", "application/pdf")
|
||||||
assert tesseract_parser.archive_path is None
|
assert tesseract_parser.archive_path is None
|
||||||
assert_ordered_substrings(
|
assert_ordered_substrings(
|
||||||
@@ -461,7 +449,7 @@ class TestParsePdf:
|
|||||||
tesseract_parser: RasterisedDocumentParser,
|
tesseract_parser: RasterisedDocumentParser,
|
||||||
tesseract_samples_dir: Path,
|
tesseract_samples_dir: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
tesseract_parser.settings.mode = "auto"
|
tesseract_parser.settings.mode = "skip"
|
||||||
tesseract_parser.parse(
|
tesseract_parser.parse(
|
||||||
tesseract_samples_dir / "encrypted.pdf",
|
tesseract_samples_dir / "encrypted.pdf",
|
||||||
"application/pdf",
|
"application/pdf",
|
||||||
@@ -571,7 +559,7 @@ class TestParseMultiPage:
|
|||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"mode",
|
"mode",
|
||||||
[
|
[
|
||||||
pytest.param("auto", id="auto"),
|
pytest.param("skip", id="skip"),
|
||||||
pytest.param("redo", id="redo"),
|
pytest.param("redo", id="redo"),
|
||||||
pytest.param("force", id="force"),
|
pytest.param("force", id="force"),
|
||||||
],
|
],
|
||||||
@@ -599,7 +587,7 @@ class TestParseMultiPage:
|
|||||||
tesseract_parser: RasterisedDocumentParser,
|
tesseract_parser: RasterisedDocumentParser,
|
||||||
tesseract_samples_dir: Path,
|
tesseract_samples_dir: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
tesseract_parser.settings.mode = "auto"
|
tesseract_parser.settings.mode = "skip"
|
||||||
tesseract_parser.parse(
|
tesseract_parser.parse(
|
||||||
tesseract_samples_dir / "multi-page-images.pdf",
|
tesseract_samples_dir / "multi-page-images.pdf",
|
||||||
"application/pdf",
|
"application/pdf",
|
||||||
@@ -747,18 +735,16 @@ class TestSkipArchive:
|
|||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
- File with existing text layer
|
- File with existing text layer
|
||||||
- Mode: auto, produce_archive=False
|
- Mode: skip_noarchive
|
||||||
WHEN:
|
WHEN:
|
||||||
- Document is parsed
|
- Document is parsed
|
||||||
THEN:
|
THEN:
|
||||||
- Text extracted from original; no archive created (text exists +
|
- Text extracted; no archive created
|
||||||
produce_archive=False skips OCRmyPDF entirely)
|
|
||||||
"""
|
"""
|
||||||
tesseract_parser.settings.mode = "auto"
|
tesseract_parser.settings.mode = "skip_noarchive"
|
||||||
tesseract_parser.parse(
|
tesseract_parser.parse(
|
||||||
tesseract_samples_dir / "multi-page-digital.pdf",
|
tesseract_samples_dir / "multi-page-digital.pdf",
|
||||||
"application/pdf",
|
"application/pdf",
|
||||||
produce_archive=False,
|
|
||||||
)
|
)
|
||||||
assert tesseract_parser.archive_path is None
|
assert tesseract_parser.archive_path is None
|
||||||
assert_ordered_substrings(
|
assert_ordered_substrings(
|
||||||
@@ -774,13 +760,13 @@ class TestSkipArchive:
|
|||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
- File with image-only pages (no text layer)
|
- File with image-only pages (no text layer)
|
||||||
- Mode: auto, skip_archive_file: auto
|
- Mode: skip_noarchive
|
||||||
WHEN:
|
WHEN:
|
||||||
- Document is parsed
|
- Document is parsed
|
||||||
THEN:
|
THEN:
|
||||||
- Text extracted; archive created (OCR needed, no existing text)
|
- Text extracted; archive created (OCR needed)
|
||||||
"""
|
"""
|
||||||
tesseract_parser.settings.mode = "auto"
|
tesseract_parser.settings.mode = "skip_noarchive"
|
||||||
tesseract_parser.parse(
|
tesseract_parser.parse(
|
||||||
tesseract_samples_dir / "multi-page-images.pdf",
|
tesseract_samples_dir / "multi-page-images.pdf",
|
||||||
"application/pdf",
|
"application/pdf",
|
||||||
@@ -792,58 +778,41 @@ class TestSkipArchive:
|
|||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("produce_archive", "filename", "expect_archive"),
|
("skip_archive_file", "filename", "expect_archive"),
|
||||||
[
|
[
|
||||||
|
pytest.param("never", "multi-page-digital.pdf", True, id="never-with-text"),
|
||||||
|
pytest.param("never", "multi-page-images.pdf", True, id="never-no-text"),
|
||||||
pytest.param(
|
pytest.param(
|
||||||
True,
|
"with_text",
|
||||||
"multi-page-digital.pdf",
|
|
||||||
True,
|
|
||||||
id="produce-archive-with-text",
|
|
||||||
),
|
|
||||||
pytest.param(
|
|
||||||
True,
|
|
||||||
"multi-page-images.pdf",
|
|
||||||
True,
|
|
||||||
id="produce-archive-no-text",
|
|
||||||
),
|
|
||||||
pytest.param(
|
|
||||||
False,
|
|
||||||
"multi-page-digital.pdf",
|
"multi-page-digital.pdf",
|
||||||
False,
|
False,
|
||||||
id="no-archive-with-text-layer",
|
id="with-text-layer",
|
||||||
),
|
),
|
||||||
pytest.param(
|
pytest.param(
|
||||||
False,
|
"with_text",
|
||||||
"multi-page-images.pdf",
|
"multi-page-images.pdf",
|
||||||
False,
|
True,
|
||||||
id="no-archive-no-text-layer",
|
id="with-text-no-layer",
|
||||||
),
|
),
|
||||||
|
pytest.param(
|
||||||
|
"always",
|
||||||
|
"multi-page-digital.pdf",
|
||||||
|
False,
|
||||||
|
id="always-with-text",
|
||||||
|
),
|
||||||
|
pytest.param("always", "multi-page-images.pdf", False, id="always-no-text"),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_produce_archive_flag(
|
def test_skip_archive_file_setting(
|
||||||
self,
|
self,
|
||||||
produce_archive: bool, # noqa: FBT001
|
skip_archive_file: str,
|
||||||
filename: str,
|
filename: str,
|
||||||
expect_archive: bool, # noqa: FBT001
|
expect_archive: str,
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
tesseract_parser: RasterisedDocumentParser,
|
||||||
tesseract_samples_dir: Path,
|
tesseract_samples_dir: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
tesseract_parser.settings.skip_archive_file = skip_archive_file
|
||||||
GIVEN:
|
tesseract_parser.parse(tesseract_samples_dir / filename, "application/pdf")
|
||||||
- Various PDFs (with and without text layers)
|
|
||||||
- produce_archive flag set to True or False
|
|
||||||
WHEN:
|
|
||||||
- Document is parsed
|
|
||||||
THEN:
|
|
||||||
- archive_path is set if and only if produce_archive=True
|
|
||||||
- Text is always extracted
|
|
||||||
"""
|
|
||||||
tesseract_parser.settings.mode = "auto"
|
|
||||||
tesseract_parser.parse(
|
|
||||||
tesseract_samples_dir / filename,
|
|
||||||
"application/pdf",
|
|
||||||
produce_archive=produce_archive,
|
|
||||||
)
|
|
||||||
text = tesseract_parser.get_text().lower()
|
text = tesseract_parser.get_text().lower()
|
||||||
assert_ordered_substrings(text, ["page 1", "page 2", "page 3"])
|
assert_ordered_substrings(text, ["page 1", "page 2", "page 3"])
|
||||||
if expect_archive:
|
if expect_archive:
|
||||||
@@ -851,59 +820,6 @@ class TestSkipArchive:
|
|||||||
else:
|
else:
|
||||||
assert tesseract_parser.archive_path is None
|
assert tesseract_parser.archive_path is None
|
||||||
|
|
||||||
def test_tagged_pdf_skips_ocr_in_auto_mode(
|
|
||||||
self,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
|
||||||
tesseract_samples_dir: Path,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true)
|
|
||||||
- Mode: auto, produce_archive=False
|
|
||||||
WHEN:
|
|
||||||
- Document is parsed
|
|
||||||
THEN:
|
|
||||||
- OCRmyPDF is not invoked (tagged ⇒ original_has_text=True)
|
|
||||||
- Text is extracted from the original via pdftotext
|
|
||||||
- No archive is produced
|
|
||||||
"""
|
|
||||||
tesseract_parser.settings.mode = "auto"
|
|
||||||
mock_ocr = mocker.patch("ocrmypdf.ocr")
|
|
||||||
tesseract_parser.parse(
|
|
||||||
tesseract_samples_dir / "simple-digital.pdf",
|
|
||||||
"application/pdf",
|
|
||||||
produce_archive=False,
|
|
||||||
)
|
|
||||||
mock_ocr.assert_not_called()
|
|
||||||
assert tesseract_parser.archive_path is None
|
|
||||||
assert tesseract_parser.get_text()
|
|
||||||
|
|
||||||
def test_tagged_pdf_produces_pdfa_archive_without_ocr(
|
|
||||||
self,
|
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
|
||||||
tesseract_samples_dir: Path,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true)
|
|
||||||
- Mode: auto, produce_archive=True
|
|
||||||
WHEN:
|
|
||||||
- Document is parsed
|
|
||||||
THEN:
|
|
||||||
- OCRmyPDF runs with skip_text (PDF/A conversion only, no OCR)
|
|
||||||
- Archive is produced
|
|
||||||
- Text is preserved from the original
|
|
||||||
"""
|
|
||||||
tesseract_parser.settings.mode = "auto"
|
|
||||||
tesseract_parser.parse(
|
|
||||||
tesseract_samples_dir / "simple-digital.pdf",
|
|
||||||
"application/pdf",
|
|
||||||
produce_archive=True,
|
|
||||||
)
|
|
||||||
assert tesseract_parser.archive_path is not None
|
|
||||||
assert tesseract_parser.get_text()
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Parse — mixed pages / sidecar
|
# Parse — mixed pages / sidecar
|
||||||
@@ -919,13 +835,13 @@ class TestParseMixed:
|
|||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
- File with text in some pages (image) and some pages (digital)
|
- File with text in some pages (image) and some pages (digital)
|
||||||
- Mode: auto (skip_text), skip_archive_file: always
|
- Mode: skip
|
||||||
WHEN:
|
WHEN:
|
||||||
- Document is parsed
|
- Document is parsed
|
||||||
THEN:
|
THEN:
|
||||||
- All pages extracted; archive created; sidecar notes skipped pages
|
- All pages extracted; archive created; sidecar notes skipped pages
|
||||||
"""
|
"""
|
||||||
tesseract_parser.settings.mode = "auto"
|
tesseract_parser.settings.mode = "skip"
|
||||||
tesseract_parser.parse(
|
tesseract_parser.parse(
|
||||||
tesseract_samples_dir / "multi-page-mixed.pdf",
|
tesseract_samples_dir / "multi-page-mixed.pdf",
|
||||||
"application/pdf",
|
"application/pdf",
|
||||||
@@ -982,18 +898,17 @@ class TestParseMixed:
|
|||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
- File with mixed pages (some with text, some image-only)
|
- File with mixed pages
|
||||||
- Mode: auto, produce_archive=False
|
- Mode: skip_noarchive
|
||||||
WHEN:
|
WHEN:
|
||||||
- Document is parsed
|
- Document is parsed
|
||||||
THEN:
|
THEN:
|
||||||
- No archive created (produce_archive=False); text from text layer present
|
- No archive created (file has text layer); later-page text present
|
||||||
"""
|
"""
|
||||||
tesseract_parser.settings.mode = "auto"
|
tesseract_parser.settings.mode = "skip_noarchive"
|
||||||
tesseract_parser.parse(
|
tesseract_parser.parse(
|
||||||
tesseract_samples_dir / "multi-page-mixed.pdf",
|
tesseract_samples_dir / "multi-page-mixed.pdf",
|
||||||
"application/pdf",
|
"application/pdf",
|
||||||
produce_archive=False,
|
|
||||||
)
|
)
|
||||||
assert tesseract_parser.archive_path is None
|
assert tesseract_parser.archive_path is None
|
||||||
assert_ordered_substrings(
|
assert_ordered_substrings(
|
||||||
@@ -1008,12 +923,12 @@ class TestParseMixed:
|
|||||||
|
|
||||||
|
|
||||||
class TestParseRotate:
|
class TestParseRotate:
|
||||||
def test_rotate_auto_mode(
|
def test_rotate_skip_mode(
|
||||||
self,
|
self,
|
||||||
tesseract_parser: RasterisedDocumentParser,
|
tesseract_parser: RasterisedDocumentParser,
|
||||||
tesseract_samples_dir: Path,
|
tesseract_samples_dir: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
tesseract_parser.settings.mode = "auto"
|
tesseract_parser.settings.mode = "skip"
|
||||||
tesseract_parser.settings.rotate = True
|
tesseract_parser.settings.rotate = True
|
||||||
tesseract_parser.parse(tesseract_samples_dir / "rotated.pdf", "application/pdf")
|
tesseract_parser.parse(tesseract_samples_dir / "rotated.pdf", "application/pdf")
|
||||||
assert_ordered_substrings(
|
assert_ordered_substrings(
|
||||||
@@ -1040,19 +955,12 @@ class TestParseRtl:
|
|||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
- PDF with RTL Arabic text in its text layer (short: 18 chars)
|
- PDF with RTL Arabic text
|
||||||
- mode=off, produce_archive=True: PDF/A conversion via skip_text, no OCR engine
|
|
||||||
WHEN:
|
WHEN:
|
||||||
- Document is parsed
|
- Document is parsed
|
||||||
THEN:
|
THEN:
|
||||||
- Arabic content is extracted from the PDF text layer (normalised for bidi)
|
- Arabic content is extracted (normalised for bidi)
|
||||||
|
|
||||||
Note: The RTL PDF has a short text layer (< VALID_TEXT_LENGTH=50) so AUTO mode
|
|
||||||
would attempt full OCR, which fails due to PriorOcrFoundError and falls back to
|
|
||||||
force-ocr with English Tesseract (producing garbage). Using mode="off" forces
|
|
||||||
skip_text=True so the Arabic text layer is preserved through PDF/A conversion.
|
|
||||||
"""
|
"""
|
||||||
tesseract_parser.settings.mode = "off"
|
|
||||||
tesseract_parser.parse(
|
tesseract_parser.parse(
|
||||||
tesseract_samples_dir / "rtl-test.pdf",
|
tesseract_samples_dir / "rtl-test.pdf",
|
||||||
"application/pdf",
|
"application/pdf",
|
||||||
@@ -1115,11 +1023,11 @@ class TestOcrmypdfParameters:
|
|||||||
assert ("clean" in params) == expected_clean
|
assert ("clean" in params) == expected_clean
|
||||||
assert ("clean_final" in params) == expected_clean_final
|
assert ("clean_final" in params) == expected_clean_final
|
||||||
|
|
||||||
def test_clean_final_auto_mode(
|
def test_clean_final_skip_mode(
|
||||||
self,
|
self,
|
||||||
make_tesseract_parser: MakeTesseractParser,
|
make_tesseract_parser: MakeTesseractParser,
|
||||||
) -> None:
|
) -> None:
|
||||||
with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="auto") as parser:
|
with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="skip") as parser:
|
||||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||||
assert params["clean_final"] is True
|
assert params["clean_final"] is True
|
||||||
assert "clean" not in params
|
assert "clean" not in params
|
||||||
@@ -1136,9 +1044,9 @@ class TestOcrmypdfParameters:
|
|||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("ocr_mode", "ocr_deskew", "expect_deskew"),
|
("ocr_mode", "ocr_deskew", "expect_deskew"),
|
||||||
[
|
[
|
||||||
pytest.param("auto", True, True, id="auto-deskew-on"),
|
pytest.param("skip", True, True, id="skip-deskew-on"),
|
||||||
pytest.param("redo", True, False, id="redo-deskew-off"),
|
pytest.param("redo", True, False, id="redo-deskew-off"),
|
||||||
pytest.param("auto", False, False, id="auto-no-deskew"),
|
pytest.param("skip", False, False, id="skip-no-deskew"),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_deskew_option(
|
def test_deskew_option(
|
||||||
|
|||||||
@@ -132,13 +132,13 @@ class TestOcrSettingsChecks:
|
|||||||
pytest.param(
|
pytest.param(
|
||||||
"OCR_MODE",
|
"OCR_MODE",
|
||||||
"skip_noarchive",
|
"skip_noarchive",
|
||||||
'OCR output mode "skip_noarchive"',
|
"deprecated",
|
||||||
id="deprecated-mode-now-invalid",
|
id="deprecated-mode",
|
||||||
),
|
),
|
||||||
pytest.param(
|
pytest.param(
|
||||||
"ARCHIVE_FILE_GENERATION",
|
"OCR_SKIP_ARCHIVE_FILE",
|
||||||
"invalid",
|
"invalid",
|
||||||
'PAPERLESS_ARCHIVE_FILE_GENERATION setting "invalid"',
|
'OCR_SKIP_ARCHIVE_FILE setting "invalid"',
|
||||||
id="invalid-skip-archive-file",
|
id="invalid-skip-archive-file",
|
||||||
),
|
),
|
||||||
pytest.param(
|
pytest.param(
|
||||||
|
|||||||
@@ -1,64 +0,0 @@
|
|||||||
"""Tests for v3 system checks: deprecated v2 OCR env var warnings."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import TYPE_CHECKING
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from paperless.checks import check_deprecated_v2_ocr_env_vars
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from pytest_mock import MockerFixture
|
|
||||||
|
|
||||||
|
|
||||||
class TestDeprecatedV2OcrEnvVarWarnings:
|
|
||||||
def test_no_deprecated_vars_returns_empty(self, mocker: MockerFixture) -> None:
|
|
||||||
"""No warnings when neither deprecated variable is set."""
|
|
||||||
mocker.patch.dict(os.environ, {"PAPERLESS_OCR_MODE": "auto"}, clear=True)
|
|
||||||
result = check_deprecated_v2_ocr_env_vars(None)
|
|
||||||
assert result == []
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
("env_var", "env_value", "expected_id", "expected_fragment"),
|
|
||||||
[
|
|
||||||
pytest.param(
|
|
||||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE",
|
|
||||||
"always",
|
|
||||||
"paperless.W002",
|
|
||||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE",
|
|
||||||
id="skip-archive-file-warns",
|
|
||||||
),
|
|
||||||
pytest.param(
|
|
||||||
"PAPERLESS_OCR_MODE",
|
|
||||||
"skip",
|
|
||||||
"paperless.W003",
|
|
||||||
"skip",
|
|
||||||
id="ocr-mode-skip-warns",
|
|
||||||
),
|
|
||||||
pytest.param(
|
|
||||||
"PAPERLESS_OCR_MODE",
|
|
||||||
"skip_noarchive",
|
|
||||||
"paperless.W003",
|
|
||||||
"skip_noarchive",
|
|
||||||
id="ocr-mode-skip-noarchive-warns",
|
|
||||||
),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_deprecated_var_produces_one_warning(
|
|
||||||
self,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
env_var: str,
|
|
||||||
env_value: str,
|
|
||||||
expected_id: str,
|
|
||||||
expected_fragment: str,
|
|
||||||
) -> None:
|
|
||||||
"""Each deprecated setting in isolation produces exactly one warning."""
|
|
||||||
mocker.patch.dict(os.environ, {env_var: env_value}, clear=True)
|
|
||||||
result = check_deprecated_v2_ocr_env_vars(None)
|
|
||||||
|
|
||||||
assert len(result) == 1
|
|
||||||
warning = result[0]
|
|
||||||
assert warning.id == expected_id
|
|
||||||
assert expected_fragment in warning.msg
|
|
||||||
@@ -1,66 +0,0 @@
|
|||||||
"""Tests for OcrConfig archive_file_generation field behavior."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from typing import TYPE_CHECKING
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from django.test import override_settings
|
|
||||||
|
|
||||||
from paperless.config import OcrConfig
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from unittest.mock import MagicMock
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
|
||||||
def null_app_config(mocker) -> MagicMock:
|
|
||||||
"""Mock ApplicationConfiguration with all fields None → falls back to Django settings."""
|
|
||||||
return mocker.MagicMock(
|
|
||||||
output_type=None,
|
|
||||||
pages=None,
|
|
||||||
language=None,
|
|
||||||
mode=None,
|
|
||||||
archive_file_generation=None,
|
|
||||||
image_dpi=None,
|
|
||||||
unpaper_clean=None,
|
|
||||||
deskew=None,
|
|
||||||
rotate_pages=None,
|
|
||||||
rotate_pages_threshold=None,
|
|
||||||
max_image_pixels=None,
|
|
||||||
color_conversion_strategy=None,
|
|
||||||
user_args=None,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
|
||||||
def make_ocr_config(mocker, null_app_config):
|
|
||||||
mocker.patch(
|
|
||||||
"paperless.config.BaseConfig._get_config_instance",
|
|
||||||
return_value=null_app_config,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _make(**django_settings_overrides):
|
|
||||||
with override_settings(**django_settings_overrides):
|
|
||||||
return OcrConfig()
|
|
||||||
|
|
||||||
return _make
|
|
||||||
|
|
||||||
|
|
||||||
class TestOcrConfigArchiveFileGeneration:
|
|
||||||
def test_auto_from_settings(self, make_ocr_config) -> None:
|
|
||||||
cfg = make_ocr_config(OCR_MODE="auto", ARCHIVE_FILE_GENERATION="auto")
|
|
||||||
assert cfg.archive_file_generation == "auto"
|
|
||||||
|
|
||||||
def test_always_from_settings(self, make_ocr_config) -> None:
|
|
||||||
cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="always")
|
|
||||||
assert cfg.archive_file_generation == "always"
|
|
||||||
|
|
||||||
def test_never_from_settings(self, make_ocr_config) -> None:
|
|
||||||
cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="never")
|
|
||||||
assert cfg.archive_file_generation == "never"
|
|
||||||
|
|
||||||
def test_db_value_overrides_setting(self, make_ocr_config, null_app_config) -> None:
|
|
||||||
null_app_config.archive_file_generation = "never"
|
|
||||||
cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="always")
|
|
||||||
assert cfg.archive_file_generation == "never"
|
|
||||||
@@ -1,25 +0,0 @@
|
|||||||
"""Tests for paperless.parsers.utils helpers."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from paperless.parsers.utils import is_tagged_pdf
|
|
||||||
|
|
||||||
SAMPLES = Path(__file__).parent / "samples" / "tesseract"
|
|
||||||
|
|
||||||
|
|
||||||
class TestIsTaggedPdf:
|
|
||||||
def test_tagged_pdf_returns_true(self) -> None:
|
|
||||||
assert is_tagged_pdf(SAMPLES / "simple-digital.pdf") is True
|
|
||||||
|
|
||||||
def test_untagged_pdf_returns_false(self) -> None:
|
|
||||||
assert is_tagged_pdf(SAMPLES / "multi-page-images.pdf") is False
|
|
||||||
|
|
||||||
def test_nonexistent_path_returns_false(self) -> None:
|
|
||||||
assert is_tagged_pdf(Path("/nonexistent/file.pdf")) is False
|
|
||||||
|
|
||||||
def test_corrupt_pdf_returns_false(self, tmp_path: Path) -> None:
|
|
||||||
bad = tmp_path / "bad.pdf"
|
|
||||||
bad.write_bytes(b"not a pdf")
|
|
||||||
assert is_tagged_pdf(bad) is False
|
|
||||||
Reference in New Issue
Block a user