mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-31 19:04:52 +00:00
feat: add DocumentVersion model, remove version fields from Document
Introduces DocumentVersion as a dedicated model for per-version file data, replacing the self-referential root_document/version_index/version_label fields on Document. Removes get_effective_content() and simplifies suggestion_content to use Document.content directly. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+173
-102
@@ -155,6 +155,173 @@ class StoragePath(MatchingModel):
|
||||
verbose_name_plural = _("storage paths")
|
||||
|
||||
|
||||
class DocumentVersion(models.Model):
    """
    Stores per-version file data for a document.

    Version 1 is created on initial consume; subsequent uploads add higher
    numbers. Document.filename / content / checksum always reflect the latest
    version. DocumentVersion.pk is used as the version ID in API calls.
    version_number is a per-document sequential integer used for filename
    suffixes (_v2, etc.).
    """

    # Upper bound shared by every stored-filename column on this model.
    MAX_STORED_FILENAME_LENGTH: Final[int] = 1024

    # Owning document; deleting the document cascades to its versions, which
    # are reachable from the document as ``document.versions``.
    document = models.ForeignKey(
        "Document",
        on_delete=models.CASCADE,
        related_name="versions",
        verbose_name=_("document"),
    )

    # Unique per document (see Meta.constraints).
    version_number = models.PositiveSmallIntegerField(
        _("version number"),
        help_text=_("Sequential version number within this document, starting at 1."),
    )

    version_label = models.CharField(
        _("version label"),
        max_length=64,
        blank=True,
        null=True,
        help_text=_("Optional short label for this version."),
    )

    added = models.DateTimeField(
        _("added"),
        default=timezone.now,
        editable=False,
        db_index=True,
    )

    checksum = models.CharField(
        _("checksum"),
        max_length=64,
        editable=False,
        help_text=_("SHA-256 checksum of the original file for this version."),
    )

    archive_checksum = models.CharField(
        _("archive checksum"),
        max_length=64,
        blank=True,
        null=True,
        editable=False,
    )

    content = models.TextField(
        _("content"),
        blank=True,
        help_text=_("OCR text content of this version."),
    )

    page_count = models.PositiveIntegerField(
        _("page count"),
        blank=True,
        null=True,
        validators=[MinValueValidator(1)],
    )

    mime_type = models.CharField(_("mime type"), max_length=256, editable=False)

    original_filename = models.CharField(
        _("original filename"),
        max_length=MAX_STORED_FILENAME_LENGTH,
        editable=False,
        null=True,
        blank=True,
    )

    # Joined onto settings.ORIGINALS_DIR by ``source_path``.
    filename = models.FilePathField(
        _("filename"),
        max_length=MAX_STORED_FILENAME_LENGTH,
        editable=False,
        default=None,
        null=True,
        help_text=_("Stored filename for this version's original file."),
    )

    # Joined onto settings.ARCHIVE_DIR by ``archive_path``; NULL means this
    # version has no archive file.
    archive_filename = models.FilePathField(
        _("archive filename"),
        max_length=MAX_STORED_FILENAME_LENGTH,
        editable=False,
        default=None,
        null=True,
    )

    class Meta:
        # Highest version number first.
        ordering = ["-version_number"]
        verbose_name = _("document version")
        verbose_name_plural = _("document versions")
        constraints = [
            # A version number may be used only once per document.
            models.UniqueConstraint(
                fields=["document", "version_number"],
                name="documents_documentversion_doc_number_uniq",
            ),
        ]

    def __str__(self) -> str:
        return f"DocumentVersion {self.version_number} of document {self.document_id}"

    @property
    def source_path(self) -> Path:
        """
        Resolved path of this version's original file under ORIGINALS_DIR.

        Raises:
            ValueError: if no filename has been set on this version.
        """
        if self.filename is None:
            raise ValueError(f"DocumentVersion {self.pk} has no filename set")
        return (settings.ORIGINALS_DIR / Path(str(self.filename))).resolve()

    @property
    def has_archive_version(self) -> bool:
        """True when an archive filename is recorded for this version."""
        return self.archive_filename is not None

    @property
    def archive_path(self) -> Path | None:
        """
        Resolved path of the archive file under ARCHIVE_DIR, or None when
        this version has no archive filename.
        """
        if self.archive_filename is not None:
            return (settings.ARCHIVE_DIR / Path(str(self.archive_filename))).resolve()
        return None

    @property
    def thumbnail_path(self) -> Path:
        """Resolved path of this version's thumbnail under THUMBNAIL_DIR."""
        # Prefix "v" avoids collision with Document thumbnails ({pk:07}.webp)
        return (settings.THUMBNAIL_DIR / f"v{self.pk:07}.webp").resolve()

    @property
    def source_file(self):
        """
        Open the original file for binary reading.

        The caller owns the returned file object and must close it.
        Raises ValueError (from ``source_path``) if no filename is set.
        """
        return self.source_path.open("rb")

    @property
    def archive_file(self):
        """
        Open the archive file for binary reading.

        The caller owns the returned file object and must close it.

        Raises:
            ValueError: if this version has no archive file.
        """
        archive_path = self.archive_path
        if archive_path is None:
            # Fail with a clear message instead of the opaque TypeError that
            # Path(None) would raise; mirrors the behaviour of source_path.
            raise ValueError(f"DocumentVersion {self.pk} has no archive filename set")
        return archive_path.open("rb")

    @property
    def thumbnail_file(self):
        """
        Open the thumbnail for binary reading.

        The caller owns the returned file object and must close it.
        """
        return self.thumbnail_path.open("rb")

    @property
    def file_type(self) -> str:
        """File extension derived from this version's mime type."""
        return get_default_file_extension(self.mime_type)

    def get_public_filename(
        self,
        *,
        archive: bool = False,
        counter: int = 0,
        suffix: str | None = None,
    ) -> str:
        """
        Returns a sanitized filename for download, mirroring Document.get_public_filename().

        Uses the parent document's title and correspondent for the human-readable name,
        and this version's own mime_type for the file extension.

        Args:
            archive: when true the name ends in ".pdf" instead of the
                extension derived from this version's mime type.
            counter: when non-zero, appended as "_NN" (zero-padded to two
                digits).
            suffix: optional text appended after the counter and before the
                extension.
        """
        doc = self.document  # cached FK access -- no extra query if already in memory
        result = str(doc)  # human-readable name comes from Document.__str__

        if counter:
            result += f"_{counter:02}"

        if suffix:
            result += suffix

        if archive:
            result += ".pdf"
        else:
            # NOTE(review): assumes file_type already carries the leading dot,
            # as nothing is inserted between the name and the extension here.
            result += self.file_type

        return pathvalidate.sanitize_filename(result, replacement_text="-")
|
||||
|
||||
|
||||
class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-missing]
|
||||
MAX_STORED_FILENAME_LENGTH: Final[int] = 1024
|
||||
|
||||
@@ -310,45 +477,10 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
|
||||
),
|
||||
)
|
||||
|
||||
root_document = models.ForeignKey(
|
||||
"self",
|
||||
blank=True,
|
||||
null=True,
|
||||
related_name="versions",
|
||||
on_delete=models.CASCADE,
|
||||
verbose_name=_("root document for this version"),
|
||||
)
|
||||
|
||||
version_index = models.PositiveIntegerField(
|
||||
_("version index"),
|
||||
blank=True,
|
||||
null=True,
|
||||
db_index=True,
|
||||
help_text=_("Index of this version within the root document."),
|
||||
)
|
||||
|
||||
version_label = models.CharField(
|
||||
_("version label"),
|
||||
max_length=64,
|
||||
blank=True,
|
||||
null=True,
|
||||
help_text=_("Optional short label for a document version."),
|
||||
)
|
||||
|
||||
class Meta:
    # Default ordering: most recently created documents first.
    ordering = ("-created",)
    verbose_name = _("document")
    verbose_name_plural = _("documents")
    constraints = [
        # A version index may appear only once per root document. The
        # condition restricts the constraint to rows where both columns are
        # set, so rows with a NULL root_document or version_index are exempt.
        models.UniqueConstraint(
            fields=["root_document", "version_index"],
            condition=models.Q(
                root_document__isnull=False,
                version_index__isnull=False,
            ),
            name="documents_document_root_version_index_uniq",
        ),
    ]
|
||||
|
||||
def __str__(self) -> str:
|
||||
created = self.created.isoformat()
|
||||
@@ -361,45 +493,6 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
|
||||
res += f" {self.title}"
|
||||
return res
|
||||
|
||||
def get_effective_content(self) -> str | None:
|
||||
"""
|
||||
Returns the effective content for the document.
|
||||
|
||||
For root documents, this is the latest version's content when available.
|
||||
For version documents, this is always the document's own content.
|
||||
If the queryset already annotated ``effective_content``, that value is used.
|
||||
"""
|
||||
if hasattr(self, "effective_content"):
|
||||
return getattr(self, "effective_content")
|
||||
|
||||
if self.root_document_id is not None or self.pk is None:
|
||||
return self.content
|
||||
|
||||
prefetched_cache = getattr(self, "_prefetched_objects_cache", None)
|
||||
prefetched_versions = (
|
||||
prefetched_cache.get("versions")
|
||||
if isinstance(prefetched_cache, dict)
|
||||
else None
|
||||
)
|
||||
if prefetched_versions is not None:
|
||||
# Empty list means prefetch ran and found no versions — use own content.
|
||||
if not prefetched_versions:
|
||||
return self.content
|
||||
latest_prefetched = max(prefetched_versions, key=lambda doc: doc.id)
|
||||
return latest_prefetched.content
|
||||
|
||||
latest_version_content = (
|
||||
Document.objects.filter(root_document=self)
|
||||
.order_by("-id")
|
||||
.values_list("content", flat=True)
|
||||
.first()
|
||||
)
|
||||
return (
|
||||
latest_version_content
|
||||
if latest_version_content is not None
|
||||
else self.content
|
||||
)
|
||||
|
||||
@property
|
||||
def suggestion_content(self):
|
||||
"""
|
||||
@@ -412,21 +505,12 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
|
||||
This improves processing speed for large documents while keeping
|
||||
enough context for accurate suggestions.
|
||||
"""
|
||||
effective_content = self.get_effective_content()
|
||||
if not effective_content or len(effective_content) <= 1200000:
|
||||
return effective_content
|
||||
else:
|
||||
# Use 80% from the start and 20% from the end
|
||||
# to preserve both opening and closing context.
|
||||
head_len = 800000
|
||||
tail_len = 200000
|
||||
|
||||
return " ".join(
|
||||
(
|
||||
effective_content[:head_len],
|
||||
effective_content[-tail_len:],
|
||||
),
|
||||
)
|
||||
content = self.content
|
||||
if not content or len(content) <= 1200000:
|
||||
return content
|
||||
head_len = 800000
|
||||
tail_len = 200000
|
||||
return " ".join((content[:head_len], content[-tail_len:]))
|
||||
|
||||
@property
|
||||
def source_path(self) -> Path:
|
||||
@@ -501,19 +585,6 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
|
||||
tags_to_add = self.tags.model.objects.filter(id__in=tag_ids)
|
||||
self.tags.add(*tags_to_add)
|
||||
|
||||
def delete(self, *args, **kwargs):
    """
    Delete this document, taking its versions along when it is a root.

    A document with no ``root_document_id`` is a root document, so every
    document pointing at it through ``root_document`` is deleted first.
    All arguments are forwarded unchanged to the parent ``delete()`` and its
    return value is returned.
    """
    is_root_document = self.root_document_id is None
    if is_root_document:
        # If deleting a root document, move all its versions to trash as well.
        Document.objects.filter(root_document=self).delete()
    return super().delete(*args, **kwargs)
|
||||
|
||||
|
||||
class SavedView(ModelWithOwner):
|
||||
class DisplayMode(models.TextChoices):
|
||||
|
||||
Reference in New Issue
Block a user