mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-30 04:42:45 +00:00
Compare commits
4 Commits
feature-sh
...
chore/plug
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3d0d243057 | ||
|
|
7a192d021f | ||
|
|
1e30490a46 | ||
|
|
bd9e529a63 |
3
.github/ISSUE_TEMPLATE/bug-report.yml
vendored
3
.github/ISSUE_TEMPLATE/bug-report.yml
vendored
@@ -21,6 +21,7 @@ body:
|
|||||||
- [The installation instructions](https://docs.paperless-ngx.com/setup/#installation).
|
- [The installation instructions](https://docs.paperless-ngx.com/setup/#installation).
|
||||||
- [Existing issues and discussions](https://github.com/paperless-ngx/paperless-ngx/search?q=&type=issues).
|
- [Existing issues and discussions](https://github.com/paperless-ngx/paperless-ngx/search?q=&type=issues).
|
||||||
- Disable any custom container initialization scripts, if using
|
- Disable any custom container initialization scripts, if using
|
||||||
|
- Remove any third-party parser plugins — issues caused by or requiring changes to a third-party plugin will be closed without investigation.
|
||||||
|
|
||||||
If you encounter issues while installing or configuring Paperless-ngx, please post in the ["Support" section of the discussions](https://github.com/paperless-ngx/paperless-ngx/discussions/new?category=support).
|
If you encounter issues while installing or configuring Paperless-ngx, please post in the ["Support" section of the discussions](https://github.com/paperless-ngx/paperless-ngx/discussions/new?category=support).
|
||||||
- type: textarea
|
- type: textarea
|
||||||
@@ -120,5 +121,7 @@ body:
|
|||||||
required: true
|
required: true
|
||||||
- label: I have already searched for relevant existing issues and discussions before opening this report.
|
- label: I have already searched for relevant existing issues and discussions before opening this report.
|
||||||
required: true
|
required: true
|
||||||
|
- label: I have reproduced this issue with all third-party parser plugins removed. I understand that issues caused by third-party plugins will be closed without investigation.
|
||||||
|
required: true
|
||||||
- label: I have updated the title field above with a concise description.
|
- label: I have updated the title field above with a concise description.
|
||||||
required: true
|
required: true
|
||||||
|
|||||||
@@ -723,6 +723,81 @@ services:
|
|||||||
|
|
||||||
1. Note the `:ro` tag means the folder will be mounted as read only. This is for extra security against changes
|
1. Note the `:ro` tag means the folder will be mounted as read only. This is for extra security against changes
|
||||||
|
|
||||||
|
## Installing third-party parser plugins {#parser-plugins}
|
||||||
|
|
||||||
|
Third-party parser plugins extend Paperless-ngx to support additional file
|
||||||
|
formats. A plugin is a Python package that advertises itself under the
|
||||||
|
`paperless_ngx.parsers` entry point group. Refer to the
|
||||||
|
[developer documentation](development.md#making-custom-parsers) for how to
|
||||||
|
create one.
|
||||||
|
|
||||||
|
!!! warning "Third-party plugins are not officially supported"
|
||||||
|
|
||||||
|
The Paperless-ngx maintainers do not provide support for third-party
|
||||||
|
plugins. Issues caused by or requiring changes to a third-party plugin
|
||||||
|
will be closed without further investigation. Always reproduce problems
|
||||||
|
with all plugins removed before filing a bug report.
|
||||||
|
|
||||||
|
### Docker
|
||||||
|
|
||||||
|
Use a [custom container initialization script](#custom-container-initialization)
|
||||||
|
to install the package before the webserver starts. Create a shell script and
|
||||||
|
mount it into `/custom-cont-init.d`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
# /path/to/my/scripts/install-parsers.sh
|
||||||
|
|
||||||
|
pip install my-paperless-parser-package
|
||||||
|
```
|
||||||
|
|
||||||
|
Mount it in your `docker-compose.yml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
webserver:
|
||||||
|
# ...
|
||||||
|
volumes:
|
||||||
|
- /path/to/my/scripts:/custom-cont-init.d:ro
|
||||||
|
```
|
||||||
|
|
||||||
|
The script runs as `root` before the webserver starts, so the package will be
|
||||||
|
available when Paperless-ngx discovers plugins at startup.
|
||||||
|
|
||||||
|
### Bare metal
|
||||||
|
|
||||||
|
Install the package into the same Python environment that runs Paperless-ngx.
|
||||||
|
If you followed the standard bare-metal install guide, that is the `paperless`
|
||||||
|
user's environment:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo -Hu paperless pip3 install my-paperless-parser-package
|
||||||
|
```
|
||||||
|
|
||||||
|
If you are using `uv` or a virtual environment, activate it first and then run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv pip install my-paperless-parser-package
|
||||||
|
# or
|
||||||
|
pip install my-paperless-parser-package
|
||||||
|
```
|
||||||
|
|
||||||
|
Restart all Paperless-ngx services after installation so the new plugin is
|
||||||
|
discovered.
|
||||||
|
|
||||||
|
### Verifying installation
|
||||||
|
|
||||||
|
On the next startup, check the application logs for a line confirming
|
||||||
|
discovery:
|
||||||
|
|
||||||
|
```
|
||||||
|
Loaded third-party parser 'My Parser' v1.0.0 by Acme Corp (entrypoint: 'my_parser').
|
||||||
|
```
|
||||||
|
|
||||||
|
If this line does not appear, verify that the package is installed in the
|
||||||
|
correct environment and that its `pyproject.toml` declares the
|
||||||
|
`paperless_ngx.parsers` entry point.
|
||||||
|
|
||||||
## MySQL Caveats {#mysql-caveats}
|
## MySQL Caveats {#mysql-caveats}
|
||||||
|
|
||||||
### Case Sensitivity
|
### Case Sensitivity
|
||||||
|
|||||||
@@ -370,121 +370,363 @@ docker build --file Dockerfile --tag paperless:local .
|
|||||||
|
|
||||||
## Extending Paperless-ngx
|
## Extending Paperless-ngx
|
||||||
|
|
||||||
Paperless-ngx does not have any fancy plugin systems and will probably never
|
Paperless-ngx supports third-party document parsers via a Python entry point
|
||||||
have. However, some parts of the application have been designed to allow
|
plugin system. Plugins are distributed as ordinary Python packages and
|
||||||
easy integration of additional features without any modification to the
|
discovered automatically at startup — no changes to the Paperless-ngx source
|
||||||
base code.
|
are required.
|
||||||
|
|
||||||
|
!!! warning "Third-party plugins are not officially supported"
|
||||||
|
|
||||||
|
The Paperless-ngx maintainers do not provide support for third-party
|
||||||
|
plugins. Issues that are caused by or require changes to a third-party
|
||||||
|
plugin will be closed without further investigation. If you believe you
|
||||||
|
have found a bug in Paperless-ngx itself (not in a plugin), please
|
||||||
|
reproduce it with all third-party plugins removed before filing an issue.
|
||||||
|
|
||||||
### Making custom parsers
|
### Making custom parsers
|
||||||
|
|
||||||
Paperless-ngx uses parsers to add documents. A parser is
|
Paperless-ngx uses parsers to add documents. A parser is responsible for:
|
||||||
responsible for:
|
|
||||||
|
|
||||||
- Retrieving the content from the original
|
- Extracting plain-text content from the document
|
||||||
- Creating a thumbnail
|
- Generating a thumbnail image
|
||||||
- _optional:_ Retrieving a created date from the original
|
- _optional:_ Detecting the document's creation date
|
||||||
- _optional:_ Creating an archived document from the original
|
- _optional:_ Producing a searchable PDF archive copy
|
||||||
|
|
||||||
Custom parsers can be added to Paperless-ngx to support more file types. In
|
Custom parsers are distributed as ordinary Python packages and registered
|
||||||
order to do that, you need to write the parser itself and announce its
|
via a [setuptools entry point](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
|
||||||
existence to Paperless-ngx.
|
No changes to the Paperless-ngx source are required.
|
||||||
|
|
||||||
The parser itself must extend `documents.parsers.DocumentParser` and
|
#### 1. Implementing the parser class
|
||||||
must implement the methods `parse` and `get_thumbnail`. You can provide
|
|
||||||
your own implementation to `get_date` if you don't want to rely on
|
Your parser must satisfy the `ParserProtocol` structural interface defined in
|
||||||
Paperless-ngx' default date guessing mechanisms.
|
`paperless.parsers`. The simplest approach is to write a plain class — no base
|
||||||
|
class is required, only the right attributes and methods.
|
||||||
|
|
||||||
|
**Class-level identity attributes**
|
||||||
|
|
||||||
|
The registry reads these before instantiating the parser, so they must be
|
||||||
|
plain class attributes (not instance attributes or properties):
|
||||||
|
|
||||||
```python
|
```python
|
||||||
class MyCustomParser(DocumentParser):
|
class MyCustomParser:
|
||||||
|
name = "My Format Parser" # human-readable name shown in logs
|
||||||
def parse(self, document_path, mime_type):
|
version = "1.0.0" # semantic version string
|
||||||
# This method does not return anything. Rather, you should assign
|
author = "Acme Corp" # author / organisation
|
||||||
# whatever you got from the document to the following fields:
|
url = "https://example.com/my-parser" # docs or issue tracker
|
||||||
|
|
||||||
# The content of the document.
|
|
||||||
self.text = "content"
|
|
||||||
|
|
||||||
# Optional: path to a PDF document that you created from the original.
|
|
||||||
self.archive_path = os.path.join(self.tempdir, "archived.pdf")
|
|
||||||
|
|
||||||
# Optional: "created" date of the document.
|
|
||||||
self.date = get_created_from_metadata(document_path)
|
|
||||||
|
|
||||||
def get_thumbnail(self, document_path, mime_type):
|
|
||||||
# This should return the path to a thumbnail you created for this
|
|
||||||
# document.
|
|
||||||
return os.path.join(self.tempdir, "thumb.webp")
|
|
||||||
```
|
```
|
||||||
|
|
||||||
If you encounter any issues during parsing, raise a
|
**Declaring supported MIME types**
|
||||||
`documents.parsers.ParseError`.
|
|
||||||
|
|
||||||
The `self.tempdir` directory is a temporary directory that is guaranteed
|
Return a `dict` mapping MIME type strings to preferred file extensions
|
||||||
to be empty and removed after consumption finished. You can use that
|
(including the leading dot). Paperless-ngx uses the extension when storing
|
||||||
directory to store any intermediate files and also use it to store the
|
archive copies and serving files for download.
|
||||||
thumbnail / archived document.
|
|
||||||
|
|
||||||
After that, you need to announce your parser to Paperless-ngx. You need to
|
|
||||||
connect a handler to the `document_consumer_declaration` signal. Have a
|
|
||||||
look in the file `src/paperless_tesseract/apps.py` on how that's done.
|
|
||||||
The handler is a method that returns information about your parser:
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def myparser_consumer_declaration(sender, **kwargs):
|
@classmethod
|
||||||
|
def supported_mime_types(cls) -> dict[str, str]:
|
||||||
return {
|
return {
|
||||||
"parser": MyCustomParser,
|
"application/x-my-format": ".myf",
|
||||||
"weight": 0,
|
"application/x-my-format-alt": ".myf",
|
||||||
"mime_types": {
|
|
||||||
"application/pdf": ".pdf",
|
|
||||||
"image/jpeg": ".jpg",
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
- `parser` is a reference to a class that extends `DocumentParser`.
|
**Scoring**
|
||||||
- `weight` is used whenever two or more parsers are able to parse a
|
|
||||||
file: The parser with the higher weight wins. This can be used to
|
|
||||||
override the parsers provided by Paperless-ngx.
|
|
||||||
- `mime_types` is a dictionary. The keys are the mime types your
|
|
||||||
parser supports and the value is the default file extension that
|
|
||||||
Paperless-ngx should use when storing files and serving them for
|
|
||||||
download. We could guess that from the file extensions, but some
|
|
||||||
mime types have many extensions associated with them and the Python
|
|
||||||
methods responsible for guessing the extension do not always return
|
|
||||||
the same value.
|
|
||||||
|
|
||||||
## Using Visual Studio Code devcontainer
|
When more than one parser can handle a file, the registry calls `score()` on
|
||||||
|
each candidate and picks the one with the highest result. Return `None` to
|
||||||
|
decline handling a file even though the MIME type is listed as supported (for
|
||||||
|
example, when a required external service is not configured).
|
||||||
|
|
||||||
Another easy way to get started with development is to use Visual Studio
|
| Score | Meaning |
|
||||||
Code devcontainers. This approach will create a preconfigured development
|
| ------ | ------------------------------------------------- |
|
||||||
environment with all of the required tools and dependencies.
|
| `None` | Decline — do not handle this file |
|
||||||
[Learn more about devcontainers](https://code.visualstudio.com/docs/devcontainers/containers).
|
| `10` | Default priority used by all built-in parsers |
|
||||||
The .devcontainer/vscode/tasks.json and .devcontainer/vscode/launch.json files
|
| `> 10` | Override a built-in parser for the same MIME type |
|
||||||
contain more information about the specific tasks and launch configurations (see the
|
|
||||||
non-standard "description" field).
|
|
||||||
|
|
||||||
To get started:
|
```python
|
||||||
|
@classmethod
|
||||||
|
def score(
|
||||||
|
cls,
|
||||||
|
mime_type: str,
|
||||||
|
filename: str,
|
||||||
|
path: "Path | None" = None,
|
||||||
|
) -> int | None:
|
||||||
|
# Inspect filename or file bytes here if needed.
|
||||||
|
return 10
|
||||||
|
```
|
||||||
|
|
||||||
1. Clone the repository on your machine and open the Paperless-ngx folder in VS Code.
|
**Archive and rendition flags**
|
||||||
|
|
||||||
2. VS Code will prompt you with "Reopen in container". Do so and wait for the environment to start.
|
```python
|
||||||
|
@property
|
||||||
|
def can_produce_archive(self) -> bool:
|
||||||
|
"""True if parse() can produce a searchable PDF archive copy."""
|
||||||
|
return True # or False if your parser doesn't produce PDFs
|
||||||
|
|
||||||
3. In case your host operating system is Windows:
|
@property
|
||||||
- The Source Control view in Visual Studio Code might show: "The detected Git repository is potentially unsafe as the folder is owned by someone other than the current user." Use "Manage Unsafe Repositories" to fix this.
|
def requires_pdf_rendition(self) -> bool:
|
||||||
- Git might have detected modifications for all files, because Windows is using CRLF line endings. Run `git checkout .` in the container's terminal to fix this issue.
|
"""True if the original format cannot be displayed by a browser
|
||||||
|
(e.g. DOCX, ODT) and the PDF output must always be kept."""
|
||||||
|
return False
|
||||||
|
```
|
||||||
|
|
||||||
4. Initialize the project by running the task **Project Setup: Run all Init Tasks**. This
|
**Context manager — temp directory lifecycle**
|
||||||
will initialize the database tables and create a superuser. Then you can compile the front end
|
|
||||||
for production or run the frontend in debug mode.
|
|
||||||
|
|
||||||
5. The project is ready for debugging; start either the fullstack debug or individual debug
|
Paperless-ngx always uses parsers as context managers. Create a temporary
|
||||||
processes. To spin up the project without debugging, run the task **Project Start: Run all Services**
|
working directory in `__enter__` (or `__init__`) and remove it in `__exit__`
|
||||||
|
regardless of whether an exception occurred. Store intermediate files,
|
||||||
|
thumbnails, and archive PDFs inside this directory.
|
||||||
|
|
||||||
## Developing Date Parser Plugins
|
```python
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Self
|
||||||
|
from types import TracebackType
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
class MyCustomParser:
|
||||||
|
...
|
||||||
|
|
||||||
|
def __init__(self, logging_group: object = None) -> None:
|
||||||
|
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
self._tempdir = Path(
|
||||||
|
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
|
||||||
|
)
|
||||||
|
self._text: str | None = None
|
||||||
|
self._archive_path: Path | None = None
|
||||||
|
|
||||||
|
def __enter__(self) -> Self:
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(
|
||||||
|
self,
|
||||||
|
exc_type: type[BaseException] | None,
|
||||||
|
exc_val: BaseException | None,
|
||||||
|
exc_tb: TracebackType | None,
|
||||||
|
) -> None:
|
||||||
|
shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Optional context — `configure()`**
|
||||||
|
|
||||||
|
The consumer calls `configure()` with a `ParserContext` after instantiation
|
||||||
|
and before `parse()`. If your parser doesn't need context, a no-op
|
||||||
|
implementation is fine:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from paperless.parsers import ParserContext
|
||||||
|
|
||||||
|
def configure(self, context: ParserContext) -> None:
|
||||||
|
pass # override if you need context.mailrule_id, etc.
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parsing**
|
||||||
|
|
||||||
|
`parse()` is the core method. It must not return a value; instead, store
|
||||||
|
results in instance attributes and expose them via the accessor methods below.
|
||||||
|
Raise `documents.parsers.ParseError` on any unrecoverable failure.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from documents.parsers import ParseError
|
||||||
|
|
||||||
|
def parse(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
*,
|
||||||
|
produce_archive: bool = True,
|
||||||
|
) -> None:
|
||||||
|
try:
|
||||||
|
self._text = extract_text_from_my_format(document_path)
|
||||||
|
except Exception as e:
|
||||||
|
raise ParseError(f"Failed to parse {document_path}: {e}") from e
|
||||||
|
|
||||||
|
if produce_archive and self.can_produce_archive:
|
||||||
|
archive = self._tempdir / "archived.pdf"
|
||||||
|
convert_to_pdf(document_path, archive)
|
||||||
|
self._archive_path = archive
|
||||||
|
```
|
||||||
|
|
||||||
|
**Result accessors**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def get_text(self) -> str | None:
|
||||||
|
return self._text
|
||||||
|
|
||||||
|
def get_date(self) -> "datetime.datetime | None":
|
||||||
|
# Return a datetime extracted from the document, or None to let
|
||||||
|
# Paperless-ngx use its default date-guessing logic.
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_archive_path(self) -> Path | None:
|
||||||
|
return self._archive_path
|
||||||
|
```
|
||||||
|
|
||||||
|
**Thumbnail**
|
||||||
|
|
||||||
|
`get_thumbnail()` may be called independently of `parse()`. Return the path
|
||||||
|
to a WebP image inside `self._tempdir`. The image should be roughly 500 × 700
|
||||||
|
pixels.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||||
|
thumb = self._tempdir / "thumb.webp"
|
||||||
|
render_thumbnail(document_path, thumb)
|
||||||
|
return thumb
|
||||||
|
```
|
||||||
|
|
||||||
|
**Optional methods**
|
||||||
|
|
||||||
|
These are called by the API on demand, not during the consumption pipeline.
|
||||||
|
Implement them if your format supports the information; otherwise return
|
||||||
|
`None` / `[]`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def get_page_count(self, document_path: Path, mime_type: str) -> int | None:
|
||||||
|
return count_pages(document_path)
|
||||||
|
|
||||||
|
def extract_metadata(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
) -> "list[MetadataEntry]":
|
||||||
|
# Must never raise. Return [] if metadata cannot be read.
|
||||||
|
from paperless.parsers import MetadataEntry
|
||||||
|
return [
|
||||||
|
MetadataEntry(
|
||||||
|
namespace="https://example.com/ns/",
|
||||||
|
prefix="ex",
|
||||||
|
key="Author",
|
||||||
|
value="Alice",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. Registering via entry point
|
||||||
|
|
||||||
|
Add the following to your package's `pyproject.toml`. The key (left of `=`)
|
||||||
|
is an arbitrary name used only in log output; the value is the
|
||||||
|
`module:ClassName` import path.
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[project.entry-points."paperless_ngx.parsers"]
|
||||||
|
my_parser = "my_package.parsers:MyCustomParser"
|
||||||
|
```
|
||||||
|
|
||||||
|
Install your package into the same Python environment as Paperless-ngx (or
|
||||||
|
add it to the Docker image), and the parser will be discovered automatically
|
||||||
|
on the next startup. No configuration changes are needed.
|
||||||
|
|
||||||
|
To verify discovery, check the application logs at startup for a line like:
|
||||||
|
|
||||||
|
```
|
||||||
|
Loaded third-party parser 'My Format Parser' v1.0.0 by Acme Corp (entrypoint: 'my_parser').
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. Utilities
|
||||||
|
|
||||||
|
`paperless.parsers.utils` provides helpers you can import directly:
|
||||||
|
|
||||||
|
| Function | Description |
|
||||||
|
| --------------------------------------- | ---------------------------------------------------------------- |
|
||||||
|
| `read_file_handle_unicode_errors(path)` | Read a file as UTF-8, replacing invalid bytes instead of raising |
|
||||||
|
| `get_page_count_for_pdf(path)` | Count pages in a PDF using pikepdf |
|
||||||
|
| `extract_pdf_metadata(path)` | Extract XMP metadata from a PDF as a `list[MetadataEntry]` |
|
||||||
|
|
||||||
|
#### Minimal example
|
||||||
|
|
||||||
|
A complete, working parser for a hypothetical plain-XML format:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Self
|
||||||
|
from types import TracebackType
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
from documents.parsers import ParseError
|
||||||
|
from paperless.parsers import ParserContext
|
||||||
|
|
||||||
|
|
||||||
|
class XmlDocumentParser:
|
||||||
|
name = "XML Parser"
|
||||||
|
version = "1.0.0"
|
||||||
|
author = "Acme Corp"
|
||||||
|
url = "https://example.com/xml-parser"
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def supported_mime_types(cls) -> dict[str, str]:
|
||||||
|
return {"application/xml": ".xml", "text/xml": ".xml"}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def score(cls, mime_type: str, filename: str, path: Path | None = None) -> int | None:
|
||||||
|
return 10
|
||||||
|
|
||||||
|
@property
|
||||||
|
def can_produce_archive(self) -> bool:
|
||||||
|
return False
|
||||||
|
|
||||||
|
@property
|
||||||
|
def requires_pdf_rendition(self) -> bool:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def __init__(self, logging_group: object = None) -> None:
|
||||||
|
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
self._tempdir = Path(tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR))
|
||||||
|
self._text: str | None = None
|
||||||
|
|
||||||
|
def __enter__(self) -> Self:
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
|
||||||
|
shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||||
|
|
||||||
|
def configure(self, context: ParserContext) -> None:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def parse(self, document_path: Path, mime_type: str, *, produce_archive: bool = True) -> None:
|
||||||
|
try:
|
||||||
|
tree = ET.parse(document_path)
|
||||||
|
self._text = " ".join(tree.getroot().itertext())
|
||||||
|
except ET.ParseError as e:
|
||||||
|
raise ParseError(f"XML parse error: {e}") from e
|
||||||
|
|
||||||
|
def get_text(self) -> str | None:
|
||||||
|
return self._text
|
||||||
|
|
||||||
|
def get_date(self):
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_archive_path(self) -> Path | None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||||
|
from PIL import Image, ImageDraw
|
||||||
|
img = Image.new("RGB", (500, 700), color="white")
|
||||||
|
ImageDraw.Draw(img).text((10, 10), "XML Document", fill="black")
|
||||||
|
out = self._tempdir / "thumb.webp"
|
||||||
|
img.save(out, format="WEBP")
|
||||||
|
return out
|
||||||
|
|
||||||
|
def get_page_count(self, document_path: Path, mime_type: str) -> int | None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def extract_metadata(self, document_path: Path, mime_type: str) -> list:
|
||||||
|
return []
|
||||||
|
```
|
||||||
|
|
||||||
|
### Developing date parser plugins
|
||||||
|
|
||||||
Paperless-ngx uses a plugin system for date parsing, allowing you to extend or replace the default date parsing behavior. Plugins are discovered using [Python entry points](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
|
Paperless-ngx uses a plugin system for date parsing, allowing you to extend or replace the default date parsing behavior. Plugins are discovered using [Python entry points](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
|
||||||
|
|
||||||
### Creating a Date Parser Plugin
|
#### Creating a Date Parser Plugin
|
||||||
|
|
||||||
To create a custom date parser plugin, you need to:
|
To create a custom date parser plugin, you need to:
|
||||||
|
|
||||||
@@ -492,7 +734,7 @@ To create a custom date parser plugin, you need to:
|
|||||||
2. Implement the required abstract method
|
2. Implement the required abstract method
|
||||||
3. Register your plugin via an entry point
|
3. Register your plugin via an entry point
|
||||||
|
|
||||||
#### 1. Implementing the Parser Class
|
##### 1. Implementing the Parser Class
|
||||||
|
|
||||||
Your parser must extend `documents.plugins.date_parsing.DateParserPluginBase` and implement the `parse` method:
|
Your parser must extend `documents.plugins.date_parsing.DateParserPluginBase` and implement the `parse` method:
|
||||||
|
|
||||||
@@ -532,7 +774,7 @@ class MyDateParserPlugin(DateParserPluginBase):
|
|||||||
yield another_datetime
|
yield another_datetime
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 2. Configuration and Helper Methods
|
##### 2. Configuration and Helper Methods
|
||||||
|
|
||||||
Your parser instance is initialized with a `DateParserConfig` object accessible via `self.config`. This provides:
|
Your parser instance is initialized with a `DateParserConfig` object accessible via `self.config`. This provides:
|
||||||
|
|
||||||
@@ -565,11 +807,11 @@ def _filter_date(
|
|||||||
"""
|
"""
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 3. Resource Management (Optional)
|
##### 3. Resource Management (Optional)
|
||||||
|
|
||||||
If your plugin needs to acquire or release resources (database connections, API clients, etc.), override the context manager methods. Paperless-ngx will always use plugins as context managers, ensuring resources can be released even in the event of errors.
|
If your plugin needs to acquire or release resources (database connections, API clients, etc.), override the context manager methods. Paperless-ngx will always use plugins as context managers, ensuring resources can be released even in the event of errors.
|
||||||
|
|
||||||
#### 4. Registering Your Plugin
|
##### 4. Registering Your Plugin
|
||||||
|
|
||||||
Register your plugin using a setuptools entry point in your package's `pyproject.toml`:
|
Register your plugin using a setuptools entry point in your package's `pyproject.toml`:
|
||||||
|
|
||||||
@@ -580,7 +822,7 @@ my_parser = "my_package.parsers:MyDateParserPlugin"
|
|||||||
|
|
||||||
The entry point name (e.g., `"my_parser"`) is used for sorting when multiple plugins are found. Paperless-ngx will use the first plugin alphabetically by name if multiple plugins are discovered.
|
The entry point name (e.g., `"my_parser"`) is used for sorting when multiple plugins are found. Paperless-ngx will use the first plugin alphabetically by name if multiple plugins are discovered.
|
||||||
|
|
||||||
### Plugin Discovery
|
#### Plugin Discovery
|
||||||
|
|
||||||
Paperless-ngx automatically discovers and loads date parser plugins at runtime. The discovery process:
|
Paperless-ngx automatically discovers and loads date parser plugins at runtime. The discovery process:
|
||||||
|
|
||||||
@@ -591,7 +833,7 @@ Paperless-ngx automatically discovers and loads date parser plugins at runtime.
|
|||||||
|
|
||||||
If multiple plugins are installed, a warning is logged indicating which plugin was selected.
|
If multiple plugins are installed, a warning is logged indicating which plugin was selected.
|
||||||
|
|
||||||
### Example: Simple Date Parser
|
#### Example: Simple Date Parser
|
||||||
|
|
||||||
Here's a minimal example that only looks for ISO 8601 dates:
|
Here's a minimal example that only looks for ISO 8601 dates:
|
||||||
|
|
||||||
@@ -623,3 +865,30 @@ class ISODateParserPlugin(DateParserPluginBase):
|
|||||||
if filtered_date is not None:
|
if filtered_date is not None:
|
||||||
yield filtered_date
|
yield filtered_date
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Using Visual Studio Code devcontainer
|
||||||
|
|
||||||
|
Another easy way to get started with development is to use Visual Studio
|
||||||
|
Code devcontainers. This approach will create a preconfigured development
|
||||||
|
environment with all of the required tools and dependencies.
|
||||||
|
[Learn more about devcontainers](https://code.visualstudio.com/docs/devcontainers/containers).
|
||||||
|
The .devcontainer/vscode/tasks.json and .devcontainer/vscode/launch.json files
|
||||||
|
contain more information about the specific tasks and launch configurations (see the
|
||||||
|
non-standard "description" field).
|
||||||
|
|
||||||
|
To get started:
|
||||||
|
|
||||||
|
1. Clone the repository on your machine and open the Paperless-ngx folder in VS Code.
|
||||||
|
|
||||||
|
2. VS Code will prompt you with "Reopen in container". Do so and wait for the environment to start.
|
||||||
|
|
||||||
|
3. In case your host operating system is Windows:
|
||||||
|
- The Source Control view in Visual Studio Code might show: "The detected Git repository is potentially unsafe as the folder is owned by someone other than the current user." Use "Manage Unsafe Repositories" to fix this.
|
||||||
|
- Git might have detected modifications for all files, because Windows is using CRLF line endings. Run `git checkout .` in the container's terminal to fix this issue.
|
||||||
|
|
||||||
|
4. Initialize the project by running the task **Project Setup: Run all Init Tasks**. This
|
||||||
|
will initialize the database tables and create a superuser. Then you can compile the front end
|
||||||
|
for production or run the frontend in debug mode.
|
||||||
|
|
||||||
|
5. The project is ready for debugging; start either the fullstack debug or individual debug
|
||||||
|
processes. To spin up the project without debugging, run the task **Project Start: Run all Services**
|
||||||
|
|||||||
@@ -45,8 +45,6 @@ from documents.models import DocumentType
|
|||||||
from documents.models import Note
|
from documents.models import Note
|
||||||
from documents.models import SavedView
|
from documents.models import SavedView
|
||||||
from documents.models import SavedViewFilterRule
|
from documents.models import SavedViewFilterRule
|
||||||
from documents.models import ShareLink
|
|
||||||
from documents.models import ShareLinkBundle
|
|
||||||
from documents.models import StoragePath
|
from documents.models import StoragePath
|
||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
from documents.models import UiSettings
|
from documents.models import UiSettings
|
||||||
@@ -57,7 +55,6 @@ from documents.models import WorkflowActionWebhook
|
|||||||
from documents.models import WorkflowTrigger
|
from documents.models import WorkflowTrigger
|
||||||
from documents.settings import EXPORTER_ARCHIVE_NAME
|
from documents.settings import EXPORTER_ARCHIVE_NAME
|
||||||
from documents.settings import EXPORTER_FILE_NAME
|
from documents.settings import EXPORTER_FILE_NAME
|
||||||
from documents.settings import EXPORTER_SHARE_LINK_BUNDLE_NAME
|
|
||||||
from documents.settings import EXPORTER_THUMBNAIL_NAME
|
from documents.settings import EXPORTER_THUMBNAIL_NAME
|
||||||
from documents.utils import compute_checksum
|
from documents.utils import compute_checksum
|
||||||
from documents.utils import copy_file_with_basic_stats
|
from documents.utils import copy_file_with_basic_stats
|
||||||
@@ -388,12 +385,10 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
"workflow_webhook_actions": WorkflowActionWebhook.objects.all(),
|
"workflow_webhook_actions": WorkflowActionWebhook.objects.all(),
|
||||||
"workflows": Workflow.objects.all(),
|
"workflows": Workflow.objects.all(),
|
||||||
"custom_fields": CustomField.objects.all(),
|
"custom_fields": CustomField.objects.all(),
|
||||||
"custom_field_instances": CustomFieldInstance.global_objects.all(),
|
"custom_field_instances": CustomFieldInstance.objects.all(),
|
||||||
"app_configs": ApplicationConfiguration.objects.all(),
|
"app_configs": ApplicationConfiguration.objects.all(),
|
||||||
"notes": Note.global_objects.all(),
|
"notes": Note.objects.all(),
|
||||||
"documents": Document.global_objects.order_by("id").all(),
|
"documents": Document.objects.order_by("id").all(),
|
||||||
"share_links": ShareLink.global_objects.all(),
|
|
||||||
"share_link_bundles": ShareLinkBundle.objects.order_by("id").all(),
|
|
||||||
"social_accounts": SocialAccount.objects.all(),
|
"social_accounts": SocialAccount.objects.all(),
|
||||||
"social_apps": SocialApp.objects.all(),
|
"social_apps": SocialApp.objects.all(),
|
||||||
"social_tokens": SocialToken.objects.all(),
|
"social_tokens": SocialToken.objects.all(),
|
||||||
@@ -414,7 +409,6 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
)
|
)
|
||||||
|
|
||||||
document_manifest: list[dict] = []
|
document_manifest: list[dict] = []
|
||||||
share_link_bundle_manifest: list[dict] = []
|
|
||||||
manifest_path = (self.target / "manifest.json").resolve()
|
manifest_path = (self.target / "manifest.json").resolve()
|
||||||
|
|
||||||
with StreamingManifestWriter(
|
with StreamingManifestWriter(
|
||||||
@@ -433,15 +427,6 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
for record in batch:
|
for record in batch:
|
||||||
self._encrypt_record_inline(record)
|
self._encrypt_record_inline(record)
|
||||||
document_manifest.extend(batch)
|
document_manifest.extend(batch)
|
||||||
elif key == "share_link_bundles":
|
|
||||||
# Accumulate for file-copy loop; written to manifest after
|
|
||||||
for batch in serialize_queryset_batched(
|
|
||||||
qs,
|
|
||||||
batch_size=self.batch_size,
|
|
||||||
):
|
|
||||||
for record in batch:
|
|
||||||
self._encrypt_record_inline(record)
|
|
||||||
share_link_bundle_manifest.extend(batch)
|
|
||||||
elif self.split_manifest and key in (
|
elif self.split_manifest and key in (
|
||||||
"notes",
|
"notes",
|
||||||
"custom_field_instances",
|
"custom_field_instances",
|
||||||
@@ -458,13 +443,7 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
writer.write_batch(batch)
|
writer.write_batch(batch)
|
||||||
|
|
||||||
document_map: dict[int, Document] = {
|
document_map: dict[int, Document] = {
|
||||||
d.pk: d for d in Document.global_objects.order_by("id")
|
d.pk: d for d in Document.objects.order_by("id")
|
||||||
}
|
|
||||||
share_link_bundle_map: dict[int, ShareLinkBundle] = {
|
|
||||||
b.pk: b
|
|
||||||
for b in ShareLinkBundle.objects.order_by("id").prefetch_related(
|
|
||||||
"documents",
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# 3. Export files from each document
|
# 3. Export files from each document
|
||||||
@@ -499,19 +478,6 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
else:
|
else:
|
||||||
writer.write_record(document_dict)
|
writer.write_record(document_dict)
|
||||||
|
|
||||||
for bundle_dict in share_link_bundle_manifest:
|
|
||||||
bundle = share_link_bundle_map[bundle_dict["pk"]]
|
|
||||||
|
|
||||||
bundle_target = self.generate_share_link_bundle_target(
|
|
||||||
bundle,
|
|
||||||
bundle_dict,
|
|
||||||
)
|
|
||||||
|
|
||||||
if not self.data_only and bundle_target is not None:
|
|
||||||
self.copy_share_link_bundle_file(bundle, bundle_target)
|
|
||||||
|
|
||||||
writer.write_record(bundle_dict)
|
|
||||||
|
|
||||||
# 4.2 write version information to target folder
|
# 4.2 write version information to target folder
|
||||||
extra_metadata_path = (self.target / "metadata.json").resolve()
|
extra_metadata_path = (self.target / "metadata.json").resolve()
|
||||||
metadata: dict[str, str | int | dict[str, str | int]] = {
|
metadata: dict[str, str | int | dict[str, str | int]] = {
|
||||||
@@ -632,47 +598,6 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
archive_target,
|
archive_target,
|
||||||
)
|
)
|
||||||
|
|
||||||
def generate_share_link_bundle_target(
|
|
||||||
self,
|
|
||||||
bundle: ShareLinkBundle,
|
|
||||||
bundle_dict: dict,
|
|
||||||
) -> Path | None:
|
|
||||||
"""
|
|
||||||
Generates the export target for a share link bundle file, when present.
|
|
||||||
"""
|
|
||||||
if not bundle.file_path:
|
|
||||||
return None
|
|
||||||
|
|
||||||
bundle_name = Path(bundle.file_path)
|
|
||||||
if bundle_name.is_absolute():
|
|
||||||
bundle_name = Path(bundle_name.name)
|
|
||||||
|
|
||||||
bundle_name = Path("share_link_bundles") / bundle_name
|
|
||||||
bundle_target = (self.target / bundle_name).resolve()
|
|
||||||
bundle_dict["fields"]["file_path"] = str(
|
|
||||||
bundle_name.relative_to("share_link_bundles"),
|
|
||||||
)
|
|
||||||
bundle_dict[EXPORTER_SHARE_LINK_BUNDLE_NAME] = str(bundle_name)
|
|
||||||
return bundle_target
|
|
||||||
|
|
||||||
def copy_share_link_bundle_file(
|
|
||||||
self,
|
|
||||||
bundle: ShareLinkBundle,
|
|
||||||
bundle_target: Path,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
Copies a share link bundle ZIP into the export directory.
|
|
||||||
"""
|
|
||||||
bundle_source_path = bundle.absolute_file_path
|
|
||||||
if bundle_source_path is None:
|
|
||||||
raise FileNotFoundError(f"Share link bundle {bundle.pk} has no file path")
|
|
||||||
|
|
||||||
self.check_and_copy(
|
|
||||||
bundle_source_path,
|
|
||||||
None,
|
|
||||||
bundle_target,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _encrypt_record_inline(self, record: dict) -> None:
|
def _encrypt_record_inline(self, record: dict) -> None:
|
||||||
"""Encrypt sensitive fields in a single record, if passphrase is set."""
|
"""Encrypt sensitive fields in a single record, if passphrase is set."""
|
||||||
if not self.passphrase:
|
if not self.passphrase:
|
||||||
@@ -694,15 +619,12 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
"""Write per-document manifest file for --split-manifest mode."""
|
"""Write per-document manifest file for --split-manifest mode."""
|
||||||
content = [document_dict]
|
content = [document_dict]
|
||||||
content.extend(
|
content.extend(
|
||||||
serializers.serialize(
|
serializers.serialize("python", Note.objects.filter(document=document)),
|
||||||
"python",
|
|
||||||
Note.global_objects.filter(document=document),
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
content.extend(
|
content.extend(
|
||||||
serializers.serialize(
|
serializers.serialize(
|
||||||
"python",
|
"python",
|
||||||
CustomFieldInstance.global_objects.filter(document=document),
|
CustomFieldInstance.objects.filter(document=document),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json")
|
manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json")
|
||||||
|
|||||||
@@ -32,12 +32,10 @@ from documents.models import CustomFieldInstance
|
|||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
from documents.models import DocumentType
|
from documents.models import DocumentType
|
||||||
from documents.models import Note
|
from documents.models import Note
|
||||||
from documents.models import ShareLinkBundle
|
|
||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
from documents.settings import EXPORTER_ARCHIVE_NAME
|
from documents.settings import EXPORTER_ARCHIVE_NAME
|
||||||
from documents.settings import EXPORTER_CRYPTO_SETTINGS_NAME
|
from documents.settings import EXPORTER_CRYPTO_SETTINGS_NAME
|
||||||
from documents.settings import EXPORTER_FILE_NAME
|
from documents.settings import EXPORTER_FILE_NAME
|
||||||
from documents.settings import EXPORTER_SHARE_LINK_BUNDLE_NAME
|
|
||||||
from documents.settings import EXPORTER_THUMBNAIL_NAME
|
from documents.settings import EXPORTER_THUMBNAIL_NAME
|
||||||
from documents.signals.handlers import check_paths_and_prune_custom_fields
|
from documents.signals.handlers import check_paths_and_prune_custom_fields
|
||||||
from documents.signals.handlers import update_filename_and_move_files
|
from documents.signals.handlers import update_filename_and_move_files
|
||||||
@@ -127,7 +125,7 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
"Found existing user(s), this might indicate a non-empty installation",
|
"Found existing user(s), this might indicate a non-empty installation",
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
if Document.global_objects.count() != 0:
|
if Document.objects.count() != 0:
|
||||||
self.stdout.write(
|
self.stdout.write(
|
||||||
self.style.WARNING(
|
self.style.WARNING(
|
||||||
"Found existing documents(s), this might indicate a non-empty installation",
|
"Found existing documents(s), this might indicate a non-empty installation",
|
||||||
@@ -350,42 +348,18 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
f"Failed to read from archive file {doc_archive_path}",
|
f"Failed to read from archive file {doc_archive_path}",
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
def check_share_link_bundle_validity(bundle_record: dict) -> None:
|
|
||||||
if EXPORTER_SHARE_LINK_BUNDLE_NAME not in bundle_record:
|
|
||||||
return
|
|
||||||
|
|
||||||
bundle_file = bundle_record[EXPORTER_SHARE_LINK_BUNDLE_NAME]
|
|
||||||
bundle_path: Path = self.source / bundle_file
|
|
||||||
if not bundle_path.exists():
|
|
||||||
raise CommandError(
|
|
||||||
f'The manifest file refers to "{bundle_file}" which does not '
|
|
||||||
"appear to be in the source directory.",
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
with bundle_path.open(mode="rb"):
|
|
||||||
pass
|
|
||||||
except Exception as e:
|
|
||||||
raise CommandError(
|
|
||||||
f"Failed to read from share link bundle file {bundle_path}",
|
|
||||||
) from e
|
|
||||||
|
|
||||||
self.stdout.write("Checking the manifest")
|
self.stdout.write("Checking the manifest")
|
||||||
for manifest_path in self.manifest_paths:
|
for manifest_path in self.manifest_paths:
|
||||||
for record in iter_manifest_records(manifest_path):
|
for record in iter_manifest_records(manifest_path):
|
||||||
# Only check if the document files exist if this is not data only
|
# Only check if the document files exist if this is not data only
|
||||||
# We don't care about documents for a data only import
|
# We don't care about documents for a data only import
|
||||||
if self.data_only:
|
if not self.data_only and record["model"] == "documents.document":
|
||||||
continue
|
|
||||||
if record["model"] == "documents.document":
|
|
||||||
check_document_validity(record)
|
check_document_validity(record)
|
||||||
elif record["model"] == "documents.sharelinkbundle":
|
|
||||||
check_share_link_bundle_validity(record)
|
|
||||||
|
|
||||||
def _import_files_from_manifest(self) -> None:
|
def _import_files_from_manifest(self) -> None:
|
||||||
settings.ORIGINALS_DIR.mkdir(parents=True, exist_ok=True)
|
settings.ORIGINALS_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
settings.THUMBNAIL_DIR.mkdir(parents=True, exist_ok=True)
|
settings.THUMBNAIL_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
settings.ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
|
settings.ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
settings.SHARE_LINK_BUNDLE_DIR.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
self.stdout.write("Copy files into paperless...")
|
self.stdout.write("Copy files into paperless...")
|
||||||
|
|
||||||
@@ -400,21 +374,9 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
for record in iter_manifest_records(manifest_path)
|
for record in iter_manifest_records(manifest_path)
|
||||||
if record["model"] == "documents.document"
|
if record["model"] == "documents.document"
|
||||||
]
|
]
|
||||||
share_link_bundle_records = [
|
|
||||||
{
|
|
||||||
"pk": record["pk"],
|
|
||||||
EXPORTER_SHARE_LINK_BUNDLE_NAME: record.get(
|
|
||||||
EXPORTER_SHARE_LINK_BUNDLE_NAME,
|
|
||||||
),
|
|
||||||
}
|
|
||||||
for manifest_path in self.manifest_paths
|
|
||||||
for record in iter_manifest_records(manifest_path)
|
|
||||||
if record["model"] == "documents.sharelinkbundle"
|
|
||||||
and record.get(EXPORTER_SHARE_LINK_BUNDLE_NAME)
|
|
||||||
]
|
|
||||||
|
|
||||||
for record in self.track(document_records, description="Copying files..."):
|
for record in self.track(document_records, description="Copying files..."):
|
||||||
document = Document.global_objects.get(pk=record["pk"])
|
document = Document.objects.get(pk=record["pk"])
|
||||||
|
|
||||||
doc_file = record[EXPORTER_FILE_NAME]
|
doc_file = record[EXPORTER_FILE_NAME]
|
||||||
document_path = self.source / doc_file
|
document_path = self.source / doc_file
|
||||||
@@ -454,26 +416,6 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
|
|
||||||
document.save()
|
document.save()
|
||||||
|
|
||||||
for record in self.track(
|
|
||||||
share_link_bundle_records,
|
|
||||||
description="Copying share link bundles...",
|
|
||||||
):
|
|
||||||
bundle = ShareLinkBundle.objects.get(pk=record["pk"])
|
|
||||||
bundle_file = record[EXPORTER_SHARE_LINK_BUNDLE_NAME]
|
|
||||||
bundle_source_path = (self.source / bundle_file).resolve()
|
|
||||||
bundle_target_path = bundle.absolute_file_path
|
|
||||||
if bundle_target_path is None:
|
|
||||||
raise CommandError(
|
|
||||||
f"Share link bundle {bundle.pk} does not have a valid file path.",
|
|
||||||
)
|
|
||||||
|
|
||||||
with FileLock(settings.MEDIA_LOCK):
|
|
||||||
bundle_target_path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
copy_file_with_basic_stats(
|
|
||||||
bundle_source_path,
|
|
||||||
bundle_target_path,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _decrypt_record_if_needed(self, record: dict) -> dict:
|
def _decrypt_record_if_needed(self, record: dict) -> dict:
|
||||||
fields = self.CRYPT_FIELDS_BY_MODEL.get(record.get("model", ""))
|
fields = self.CRYPT_FIELDS_BY_MODEL.get(record.get("model", ""))
|
||||||
if fields:
|
if fields:
|
||||||
|
|||||||
@@ -3,7 +3,6 @@
|
|||||||
EXPORTER_FILE_NAME = "__exported_file_name__"
|
EXPORTER_FILE_NAME = "__exported_file_name__"
|
||||||
EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__"
|
EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__"
|
||||||
EXPORTER_ARCHIVE_NAME = "__exported_archive_name__"
|
EXPORTER_ARCHIVE_NAME = "__exported_archive_name__"
|
||||||
EXPORTER_SHARE_LINK_BUNDLE_NAME = "__exported_share_link_bundle_name__"
|
|
||||||
|
|
||||||
EXPORTER_CRYPTO_SETTINGS_NAME = "__crypto__"
|
EXPORTER_CRYPTO_SETTINGS_NAME = "__crypto__"
|
||||||
EXPORTER_CRYPTO_SALT_NAME = "__salt_hex__"
|
EXPORTER_CRYPTO_SALT_NAME = "__salt_hex__"
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ import hashlib
|
|||||||
import json
|
import json
|
||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
from datetime import timedelta
|
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
@@ -12,7 +11,6 @@ import pytest
|
|||||||
from allauth.socialaccount.models import SocialAccount
|
from allauth.socialaccount.models import SocialAccount
|
||||||
from allauth.socialaccount.models import SocialApp
|
from allauth.socialaccount.models import SocialApp
|
||||||
from allauth.socialaccount.models import SocialToken
|
from allauth.socialaccount.models import SocialToken
|
||||||
from django.conf import settings
|
|
||||||
from django.contrib.auth.models import Group
|
from django.contrib.auth.models import Group
|
||||||
from django.contrib.auth.models import Permission
|
from django.contrib.auth.models import Permission
|
||||||
from django.contrib.contenttypes.models import ContentType
|
from django.contrib.contenttypes.models import ContentType
|
||||||
@@ -33,8 +31,6 @@ from documents.models import CustomFieldInstance
|
|||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
from documents.models import DocumentType
|
from documents.models import DocumentType
|
||||||
from documents.models import Note
|
from documents.models import Note
|
||||||
from documents.models import ShareLink
|
|
||||||
from documents.models import ShareLinkBundle
|
|
||||||
from documents.models import StoragePath
|
from documents.models import StoragePath
|
||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
from documents.models import User
|
from documents.models import User
|
||||||
@@ -43,7 +39,6 @@ from documents.models import WorkflowAction
|
|||||||
from documents.models import WorkflowTrigger
|
from documents.models import WorkflowTrigger
|
||||||
from documents.sanity_checker import check_sanity
|
from documents.sanity_checker import check_sanity
|
||||||
from documents.settings import EXPORTER_FILE_NAME
|
from documents.settings import EXPORTER_FILE_NAME
|
||||||
from documents.settings import EXPORTER_SHARE_LINK_BUNDLE_NAME
|
|
||||||
from documents.tests.utils import DirectoriesMixin
|
from documents.tests.utils import DirectoriesMixin
|
||||||
from documents.tests.utils import FileSystemAssertsMixin
|
from documents.tests.utils import FileSystemAssertsMixin
|
||||||
from documents.tests.utils import SampleDirMixin
|
from documents.tests.utils import SampleDirMixin
|
||||||
@@ -311,108 +306,6 @@ class TestExportImport(
|
|||||||
):
|
):
|
||||||
self.test_exporter(use_filename_format=True)
|
self.test_exporter(use_filename_format=True)
|
||||||
|
|
||||||
def test_exporter_includes_share_links_and_bundles(self) -> None:
|
|
||||||
shutil.rmtree(Path(self.dirs.media_dir) / "documents")
|
|
||||||
shutil.copytree(
|
|
||||||
Path(__file__).parent / "samples" / "documents",
|
|
||||||
Path(self.dirs.media_dir) / "documents",
|
|
||||||
)
|
|
||||||
|
|
||||||
share_link = ShareLink.objects.create(
|
|
||||||
slug="share-link-slug",
|
|
||||||
document=self.d1,
|
|
||||||
owner=self.user,
|
|
||||||
file_version=ShareLink.FileVersion.ORIGINAL,
|
|
||||||
expiration=timezone.now() + timedelta(days=7),
|
|
||||||
)
|
|
||||||
|
|
||||||
bundle_relative_path = Path("nested") / "share-bundle.zip"
|
|
||||||
bundle_source_path = settings.SHARE_LINK_BUNDLE_DIR / bundle_relative_path
|
|
||||||
bundle_source_path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
bundle_source_path.write_bytes(b"share-bundle-contents")
|
|
||||||
bundle = ShareLinkBundle.objects.create(
|
|
||||||
slug="share-bundle-slug",
|
|
||||||
owner=self.user,
|
|
||||||
file_version=ShareLink.FileVersion.ARCHIVE,
|
|
||||||
expiration=timezone.now() + timedelta(days=7),
|
|
||||||
status=ShareLinkBundle.Status.READY,
|
|
||||||
size_bytes=bundle_source_path.stat().st_size,
|
|
||||||
file_path=str(bundle_relative_path),
|
|
||||||
built_at=timezone.now(),
|
|
||||||
)
|
|
||||||
bundle.documents.set([self.d1, self.d2])
|
|
||||||
|
|
||||||
manifest = self._do_export()
|
|
||||||
|
|
||||||
share_link_records = [
|
|
||||||
record for record in manifest if record["model"] == "documents.sharelink"
|
|
||||||
]
|
|
||||||
self.assertEqual(len(share_link_records), 1)
|
|
||||||
self.assertEqual(share_link_records[0]["pk"], share_link.pk)
|
|
||||||
self.assertEqual(share_link_records[0]["fields"]["document"], self.d1.pk)
|
|
||||||
self.assertEqual(share_link_records[0]["fields"]["owner"], self.user.pk)
|
|
||||||
|
|
||||||
share_link_bundle_records = [
|
|
||||||
record
|
|
||||||
for record in manifest
|
|
||||||
if record["model"] == "documents.sharelinkbundle"
|
|
||||||
]
|
|
||||||
self.assertEqual(len(share_link_bundle_records), 1)
|
|
||||||
bundle_record = share_link_bundle_records[0]
|
|
||||||
self.assertEqual(bundle_record["pk"], bundle.pk)
|
|
||||||
self.assertEqual(
|
|
||||||
bundle_record["fields"]["documents"],
|
|
||||||
[self.d1.pk, self.d2.pk],
|
|
||||||
)
|
|
||||||
self.assertEqual(
|
|
||||||
bundle_record[EXPORTER_SHARE_LINK_BUNDLE_NAME],
|
|
||||||
"share_link_bundles/nested/share-bundle.zip",
|
|
||||||
)
|
|
||||||
self.assertEqual(
|
|
||||||
bundle_record["fields"]["file_path"],
|
|
||||||
"nested/share-bundle.zip",
|
|
||||||
)
|
|
||||||
self.assertIsFile(self.target / bundle_record[EXPORTER_SHARE_LINK_BUNDLE_NAME])
|
|
||||||
|
|
||||||
with paperless_environment():
|
|
||||||
ShareLink.objects.all().delete()
|
|
||||||
ShareLinkBundle.objects.all().delete()
|
|
||||||
shutil.rmtree(settings.SHARE_LINK_BUNDLE_DIR, ignore_errors=True)
|
|
||||||
|
|
||||||
call_command(
|
|
||||||
"document_importer",
|
|
||||||
"--no-progress-bar",
|
|
||||||
self.target,
|
|
||||||
skip_checks=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
imported_share_link = ShareLink.objects.get(pk=share_link.pk)
|
|
||||||
self.assertEqual(imported_share_link.document_id, self.d1.pk)
|
|
||||||
self.assertEqual(imported_share_link.owner_id, self.user.pk)
|
|
||||||
self.assertEqual(
|
|
||||||
imported_share_link.file_version,
|
|
||||||
ShareLink.FileVersion.ORIGINAL,
|
|
||||||
)
|
|
||||||
|
|
||||||
imported_bundle = ShareLinkBundle.objects.get(pk=bundle.pk)
|
|
||||||
imported_bundle_path = imported_bundle.absolute_file_path
|
|
||||||
self.assertEqual(imported_bundle.owner_id, self.user.pk)
|
|
||||||
self.assertEqual(
|
|
||||||
list(
|
|
||||||
imported_bundle.documents.order_by("pk").values_list(
|
|
||||||
"pk",
|
|
||||||
flat=True,
|
|
||||||
),
|
|
||||||
),
|
|
||||||
[self.d1.pk, self.d2.pk],
|
|
||||||
)
|
|
||||||
self.assertEqual(imported_bundle.file_path, "nested/share-bundle.zip")
|
|
||||||
self.assertIsNotNone(imported_bundle_path)
|
|
||||||
self.assertEqual(
|
|
||||||
imported_bundle_path.read_bytes(),
|
|
||||||
b"share-bundle-contents",
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_update_export_changed_time(self) -> None:
|
def test_update_export_changed_time(self) -> None:
|
||||||
shutil.rmtree(Path(self.dirs.media_dir) / "documents")
|
shutil.rmtree(Path(self.dirs.media_dir) / "documents")
|
||||||
shutil.copytree(
|
shutil.copytree(
|
||||||
@@ -496,7 +389,7 @@ class TestExportImport(
|
|||||||
self.assertIsFile(
|
self.assertIsFile(
|
||||||
str(self.target / doc_from_manifest[EXPORTER_FILE_NAME]),
|
str(self.target / doc_from_manifest[EXPORTER_FILE_NAME]),
|
||||||
)
|
)
|
||||||
self.d3.hard_delete()
|
self.d3.delete()
|
||||||
|
|
||||||
manifest = self._do_export()
|
manifest = self._do_export()
|
||||||
self.assertRaises(
|
self.assertRaises(
|
||||||
@@ -975,52 +868,6 @@ class TestExportImport(
|
|||||||
for obj in manifest:
|
for obj in manifest:
|
||||||
self.assertNotEqual(obj["model"], "auditlog.logentry")
|
self.assertNotEqual(obj["model"], "auditlog.logentry")
|
||||||
|
|
||||||
def test_export_import_soft_deleted_document(self) -> None:
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- A document with a note and custom field instance has been soft-deleted
|
|
||||||
WHEN:
|
|
||||||
- Export and re-import are performed
|
|
||||||
THEN:
|
|
||||||
- The soft-deleted document, note, and custom field instance
|
|
||||||
survive the round-trip with deleted_at preserved
|
|
||||||
"""
|
|
||||||
shutil.rmtree(Path(self.dirs.media_dir) / "documents")
|
|
||||||
shutil.copytree(
|
|
||||||
Path(__file__).parent / "samples" / "documents",
|
|
||||||
Path(self.dirs.media_dir) / "documents",
|
|
||||||
)
|
|
||||||
|
|
||||||
# d1 has self.note and self.cfi1 attached via setUp
|
|
||||||
self.d1.delete()
|
|
||||||
|
|
||||||
self._do_export()
|
|
||||||
|
|
||||||
with paperless_environment():
|
|
||||||
Document.global_objects.all().hard_delete()
|
|
||||||
Correspondent.objects.all().delete()
|
|
||||||
DocumentType.objects.all().delete()
|
|
||||||
Tag.objects.all().delete()
|
|
||||||
|
|
||||||
call_command(
|
|
||||||
"document_importer",
|
|
||||||
"--no-progress-bar",
|
|
||||||
self.target,
|
|
||||||
skip_checks=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertEqual(Document.global_objects.count(), 4)
|
|
||||||
reimported_doc = Document.global_objects.get(pk=self.d1.pk)
|
|
||||||
self.assertIsNotNone(reimported_doc.deleted_at)
|
|
||||||
|
|
||||||
self.assertEqual(Note.global_objects.count(), 1)
|
|
||||||
reimported_note = Note.global_objects.get(pk=self.note.pk)
|
|
||||||
self.assertIsNotNone(reimported_note.deleted_at)
|
|
||||||
|
|
||||||
self.assertEqual(CustomFieldInstance.global_objects.count(), 1)
|
|
||||||
reimported_cfi = CustomFieldInstance.global_objects.get(pk=self.cfi1.pk)
|
|
||||||
self.assertIsNotNone(reimported_cfi.deleted_at)
|
|
||||||
|
|
||||||
def test_export_data_only(self) -> None:
|
def test_export_data_only(self) -> None:
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
|
|||||||
Reference in New Issue
Block a user