mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-07-02 18:24:17 +00:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5b8fbdcec7 | |||
| beb048b94a | |||
| b33d11778a |
@@ -191,7 +191,7 @@ jobs:
|
||||
runs-on: ubuntu-24.04
|
||||
permissions:
|
||||
contents: read
|
||||
container: mcr.microsoft.com/playwright:v1.61.1-noble
|
||||
container: mcr.microsoft.com/playwright:v1.60.0-noble
|
||||
env:
|
||||
PLAYWRIGHT_BROWSERS_PATH: /ms-playwright
|
||||
PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: 1
|
||||
|
||||
+32
-32
@@ -11,17 +11,17 @@
|
||||
},
|
||||
"private": true,
|
||||
"dependencies": {
|
||||
"@angular/cdk": "^22.0.2",
|
||||
"@angular/common": "~22.0.3",
|
||||
"@angular/compiler": "~22.0.3",
|
||||
"@angular/core": "~22.0.3",
|
||||
"@angular/forms": "~22.0.3",
|
||||
"@angular/localize": "~22.0.3",
|
||||
"@angular/platform-browser": "~22.0.3",
|
||||
"@angular/cdk": "^21.2.12",
|
||||
"@angular/common": "~21.2.17",
|
||||
"@angular/compiler": "~21.2.17",
|
||||
"@angular/core": "~21.2.17",
|
||||
"@angular/forms": "~21.2.14",
|
||||
"@angular/localize": "~21.2.14",
|
||||
"@angular/platform-browser": "~21.2.14",
|
||||
"@angular/platform-browser-dynamic": "~21.2.14",
|
||||
"@angular/router": "~22.0.3",
|
||||
"@ng-bootstrap/ng-bootstrap": "^21.0.0",
|
||||
"@ng-select/ng-select": "^23.0.1",
|
||||
"@angular/router": "~21.2.14",
|
||||
"@ng-bootstrap/ng-bootstrap": "^20.0.0",
|
||||
"@ng-select/ng-select": "^21.8.2",
|
||||
"@ngneat/dirty-check-forms": "^3.0.3",
|
||||
"@popperjs/core": "^2.11.8",
|
||||
"bootstrap": "^5.3.8",
|
||||
@@ -29,42 +29,42 @@
|
||||
"mime-names": "^1.0.0",
|
||||
"ngx-bootstrap-icons": "^1.9.3",
|
||||
"ngx-color": "^10.1.0",
|
||||
"ngx-cookie-service": "^22.0.0",
|
||||
"ngx-device-detector": "^12.0.0",
|
||||
"ngx-cookie-service": "^21.3.1",
|
||||
"ngx-device-detector": "^11.0.0",
|
||||
"ngx-ui-tour-ng-bootstrap": "^18.0.0",
|
||||
"normalize-diacritics": "^5.0.0",
|
||||
"pdfjs-dist": "^5.7.284",
|
||||
"rxjs": "^7.8.2",
|
||||
"tslib": "^2.8.1",
|
||||
"utif": "^3.1.0",
|
||||
"uuid": "^14.0.1",
|
||||
"uuid": "^14.0.0",
|
||||
"zone.js": "^0.16.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@angular-builders/custom-webpack": "^22.0.1",
|
||||
"@angular-builders/jest": "^22.0.1",
|
||||
"@angular-devkit/core": "^22.0.4",
|
||||
"@angular-devkit/schematics": "^22.0.4",
|
||||
"@angular-eslint/builder": "22.0.0",
|
||||
"@angular-eslint/eslint-plugin": "22.0.0",
|
||||
"@angular-eslint/eslint-plugin-template": "22.0.0",
|
||||
"@angular-eslint/schematics": "22.0.0",
|
||||
"@angular-eslint/template-parser": "22.0.0",
|
||||
"@angular/build": "^22.0.4",
|
||||
"@angular/cli": "~22.0.4",
|
||||
"@angular/compiler-cli": "~22.0.3",
|
||||
"@angular-builders/custom-webpack": "^21.0.3",
|
||||
"@angular-builders/jest": "^21.0.3",
|
||||
"@angular-devkit/core": "^21.2.12",
|
||||
"@angular-devkit/schematics": "^21.2.12",
|
||||
"@angular-eslint/builder": "21.4.0",
|
||||
"@angular-eslint/eslint-plugin": "21.4.0",
|
||||
"@angular-eslint/eslint-plugin-template": "21.4.0",
|
||||
"@angular-eslint/schematics": "21.4.0",
|
||||
"@angular-eslint/template-parser": "21.4.0",
|
||||
"@angular/build": "^21.2.12",
|
||||
"@angular/cli": "~21.2.12",
|
||||
"@angular/compiler-cli": "~21.2.14",
|
||||
"@codecov/webpack-plugin": "^2.0.1",
|
||||
"@playwright/test": "^1.61.1",
|
||||
"@playwright/test": "^1.60.0",
|
||||
"@types/jest": "^30.0.0",
|
||||
"@types/node": "^26.0.0",
|
||||
"@typescript-eslint/eslint-plugin": "^8.62.0",
|
||||
"@typescript-eslint/parser": "^8.62.0",
|
||||
"@typescript-eslint/utils": "^8.62.0",
|
||||
"eslint": "^10.5.0",
|
||||
"@types/node": "^25.9.1",
|
||||
"@typescript-eslint/eslint-plugin": "^8.60.0",
|
||||
"@typescript-eslint/parser": "^8.60.0",
|
||||
"@typescript-eslint/utils": "^8.60.0",
|
||||
"eslint": "^10.4.0",
|
||||
"jest": "30.4.2",
|
||||
"jest-environment-jsdom": "^30.4.1",
|
||||
"jest-junit": "^17.0.0",
|
||||
"jest-preset-angular": "^17.0.0",
|
||||
"jest-preset-angular": "^16.1.5",
|
||||
"jest-websocket-mock": "^2.5.0",
|
||||
"prettier-plugin-organize-imports": "^4.3.0",
|
||||
"ts-node": "~10.9.1",
|
||||
|
||||
Generated
+3362
-2705
File diff suppressed because it is too large
Load Diff
@@ -48,6 +48,9 @@ _LANGUAGE_MAP: dict[str, str] = {
|
||||
}
|
||||
|
||||
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
|
||||
# Document.title is max_length=128, so use 129 as the limit for
|
||||
# Tantivy's remove_long filter
|
||||
_TOKEN_REMOVE_LONG_LIMIT: Final[int] = 129
|
||||
|
||||
|
||||
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
||||
@@ -77,10 +80,10 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
||||
|
||||
|
||||
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
|
||||
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
|
||||
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(129) -> lowercase -> ascii_fold [-> stemmer]"""
|
||||
builder = (
|
||||
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
|
||||
.filter(tantivy.Filter.remove_long(65))
|
||||
.filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT))
|
||||
.filter(tantivy.Filter.lowercase())
|
||||
.filter(tantivy.Filter.ascii_fold())
|
||||
)
|
||||
@@ -119,12 +122,12 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer:
|
||||
|
||||
|
||||
def _simple_search_analyzer() -> tantivy.TextAnalyzer:
|
||||
"""Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(65) -> lowercase -> ascii_fold."""
|
||||
"""Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(129) -> lowercase -> ascii_fold."""
|
||||
return (
|
||||
tantivy.TextAnalyzerBuilder(
|
||||
tantivy.Tokenizer.regex(r"\S+"),
|
||||
)
|
||||
.filter(tantivy.Filter.remove_long(65))
|
||||
.filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT))
|
||||
.filter(tantivy.Filter.lowercase())
|
||||
.filter(tantivy.Filter.ascii_fold())
|
||||
.build()
|
||||
|
||||
@@ -261,6 +261,36 @@ class TestSearch:
|
||||
== 1
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("search_mode", "query"),
|
||||
[
|
||||
pytest.param(SearchMode.TITLE, "12345", id="title_search"),
|
||||
pytest.param(SearchMode.TEXT, "12345", id="text_search"),
|
||||
pytest.param(SearchMode.QUERY, None, id="query_title_exact"),
|
||||
],
|
||||
)
|
||||
def test_search_modes_match_model_limit_title_tokens(
|
||||
self,
|
||||
backend: TantivyBackend,
|
||||
search_mode: SearchMode,
|
||||
query: str | None,
|
||||
) -> None:
|
||||
"""Search must keep filename-like title tokens up to the model limit."""
|
||||
long_title = "1234567890" * 12 + "12345678"
|
||||
doc = Document.objects.create(
|
||||
title=long_title,
|
||||
content="ordinary content",
|
||||
checksum="TXT12",
|
||||
pk=18,
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
assert backend.search_ids(
|
||||
query or f"title:{long_title}",
|
||||
user=None,
|
||||
search_mode=search_mode,
|
||||
) == [doc.pk]
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("mode", "title", "content", "hits", "misses"),
|
||||
[
|
||||
|
||||
@@ -99,6 +99,25 @@ class TestTokenizers:
|
||||
)
|
||||
assert simple_search_index.searcher().search(q, limit=5).count == 1
|
||||
|
||||
def test_simple_search_analyzer_supports_model_limit_token_substrings(
|
||||
self,
|
||||
simple_search_index: tantivy.Index,
|
||||
) -> None:
|
||||
"""Simple substring search keeps tokens up to Document.title's model limit."""
|
||||
long_token = "abcdefghij" * 12 + "abcdefgh"
|
||||
writer = simple_search_index.writer()
|
||||
doc = tantivy.Document()
|
||||
doc.add_text("simple_content", long_token)
|
||||
writer.add_document(doc)
|
||||
writer.commit()
|
||||
simple_search_index.reload()
|
||||
q = tantivy.Query.regex_query(
|
||||
simple_search_index.schema,
|
||||
"simple_content",
|
||||
".*cdefg.*",
|
||||
)
|
||||
assert simple_search_index.searcher().search(q, limit=5).count == 1
|
||||
|
||||
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
|
||||
"""Unsupported language codes should log a warning and disable stemming gracefully."""
|
||||
sb = tantivy.SchemaBuilder()
|
||||
|
||||
Reference in New Issue
Block a user