_HINT_INSTRUCTION = (
    "Prefer existing names from these lists verbatim. Only propose a new value "
    "if none of the existing names fits."
)


def format_hints_for_prompt(hints: TaxonomyHints) -> str:
    """Turn taxonomy hints into prompt text.

    Each category that has at least one name becomes a block of the form
    ``<label>:`` followed by one ``- <name>`` bullet per line. Blocks are
    separated by a blank line, and a single closing instruction is appended
    after the last block.

    Args:
        hints: The names known for each taxonomy category.

    Returns:
        The rendered blocks plus the instruction, or ``""`` when every
        category is empty, so callers can treat that case like "no hints".
    """
    # Each key is read with a literal subscript so a type checker can validate
    # the access against the TypedDict. The tuple order is the order in which
    # the blocks show up in the prompt.
    categories: tuple[tuple[str, list[str]], ...] = (
        ("Available tags", hints["tags"]),
        ("Available document types", hints["document_types"]),
        ("Available correspondents", hints["correspondents"]),
        ("Available storage paths", hints["storage_paths"]),
    )

    # Categories without names are dropped entirely rather than rendered as
    # an empty heading.
    sections = [
        f"{heading}:\n" + "\n".join(f"- {name}" for name in names)
        for heading, names in categories
        if names
    ]

    # The instruction only makes sense when there is at least one list to
    # refer to; joining an empty list yields "".
    if sections:
        sections.append(_HINT_INSTRUCTION)
    return "\n\n".join(sections)
class TestFormatHintsForPrompt:
    """Checks on the prompt text produced by ``format_hints_for_prompt``."""

    def test_all_blocks_present_when_all_categories_nonempty(self) -> None:
        # One name per category, so every labelled block must be rendered.
        populated: TaxonomyHints = {
            "tags": ["Bloodwork"],
            "document_types": ["Invoice"],
            "correspondents": ["IRS"],
            "storage_paths": ["Financial"],
        }
        rendered = format_hints_for_prompt(populated)
        for fragment in (
            "Available tags:",
            "Available document types:",
            "Available correspondents:",
            "Available storage paths:",
            "- Bloodwork",
        ):
            assert fragment in rendered

    def test_empty_category_produces_no_block(self) -> None:
        # Only tags carry a name; the other three headings must be absent.
        tags_only: TaxonomyHints = {
            "tags": ["Bloodwork"],
            "document_types": [],
            "correspondents": [],
            "storage_paths": [],
        }
        rendered = format_hints_for_prompt(tags_only)
        assert "Available tags:" in rendered
        for heading in (
            "Available document types:",
            "Available correspondents:",
            "Available storage paths:",
        ):
            assert heading not in rendered

    def test_all_empty_produces_empty_string(self) -> None:
        # With nothing to list, the result is the empty string.
        nothing: TaxonomyHints = {
            "tags": [],
            "document_types": [],
            "correspondents": [],
            "storage_paths": [],
        }
        rendered = format_hints_for_prompt(nothing)
        assert rendered == ""

    def test_instruction_line_appears_once(self) -> None:
        # Two populated blocks must still share a single instruction line.
        two_categories: TaxonomyHints = {
            "tags": ["Bloodwork"],
            "document_types": ["Invoice"],
            "correspondents": [],
            "storage_paths": [],
        }
        rendered = format_hints_for_prompt(two_categories)
        occurrences = rendered.count(
            "Prefer existing names from these lists verbatim",
        )
        assert occurrences == 1