diff --git a/CHANGELOG.md b/CHANGELOG.md index c5e5b2b..79a0bc8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## 9.11.2 + +### Changes + +- **`base_reverse_dns_types.txt` removed; `sortlists.py` now reads the authoritative `type` list directly from `parsedmarc/resources/maps/README.md`.** The README's industry list (between a new pair of start/end HTML-comment markers) is now the single source of truth, eliminating the drift risk between the data file and the documented list. Before validating the map, `sortlists.py` also normalizes the README block in place: trims whitespace, deduplicates case-insensitively (errors on case-conflicting entries), and sorts entries alphabetically — so adding a new type is just inserting a `- New Type` line anywhere inside the markers. Also fixes a pre-existing typo in the precedence rules where rule 4 said `Web Hosting` but the canonical type used in 4,176 map rows is `Web Host`. +- **Maintenance tooling no longer ships in the wheel/sdist.** The Python scripts under `parsedmarc/resources/maps/` (`collect_domain_info.py`, `classify_unknown_domains.py`, `detect_psl_overrides.py`, `detect_rebrands.py`, `sortlists.py`, plus the already-excluded `find_bad_utf8.py` and `find_unknown_base_reverse_dns.py`) are maintainer-only batch tooling, not parsedmarc runtime code. They have always been in the repository for convenience but were unnecessarily included in distributions, pulling reviewer attention and contributing nothing to end-user functionality. The build now excludes any `.py` file under `parsedmarc/resources/maps/` whose name doesn't start with an underscore via a single glob pattern (`parsedmarc/resources/maps/[!_]*.py`), so future maintainer scripts added to that directory are excluded automatically while `__init__.py` continues to ship. 
The directory's `__init__.py` and the runtime data files (`base_reverse_dns_map.csv`, `known_unknown_base_reverse_dns.txt`, `psl_overrides.txt`) continue to ship — they're loaded at runtime via `importlib.resources.files(parsedmarc.resources.maps)`. + ## 9.11.1 ### Fixed diff --git a/parsedmarc/constants.py b/parsedmarc/constants.py index 70df60e..f4ab8dc 100644 --- a/parsedmarc/constants.py +++ b/parsedmarc/constants.py @@ -1,4 +1,4 @@ -__version__ = "9.11.1" +__version__ = "9.11.2" USER_AGENT = f"parsedmarc/{__version__}" diff --git a/parsedmarc/resources/maps/README.md b/parsedmarc/resources/maps/README.md index 0393bce..de58a4f 100644 --- a/parsedmarc/resources/maps/README.md +++ b/parsedmarc/resources/maps/README.md @@ -19,11 +19,12 @@ The `service_type` is based on the following rule precedence: 1. All email security services are identified as `Email Security`, no matter how or where they are hosted. 2. All marketing services are identified as `Marketing`, no matter how or where they are hosted. 3. All telecommunications providers that offer internet access are identified as `ISP`, even if they also offer other services, such as web hosting or email hosting. -4. All web hosting providers are identified as `Web Hosting`, even if the service also offers email hosting. +4. All web hosting providers are identified as `Web Host`, even if the service also offers email hosting. 5. All email account providers are identified as `Email Provider`, no matter how or where they are hosted 6. All legitimate platforms offering their Software as a Service (SaaS) are identified as `SaaS`, regardless of industry. This helps simplify metrics. 7. 
All other senders that use their own domain as a Reverse DNS base domain should be identified based on their industry + - Agriculture - Automotive - Beauty @@ -70,6 +71,9 @@ The `service_type` is based on the following rule precedence: - Travel - Utilities - Web Host + + +The list above is the authoritative set of allowed `type` values; `sortlists.py` parses the bullet items between the start and end HTML comment markers and uses them to validate every row's `type` column. Before validating the map, it also normalizes the block in place: trims whitespace, deduplicates case-insensitively (raising an error on entries that differ only by case), and sorts the entries alphabetically — so adding a new type is just a matter of inserting a `- New Type` line anywhere inside the markers, and `sortlists.py` will tidy it on the next run. Keep the markers themselves intact when editing. The file currently contains over 5,000 mappings from a wide variety of email sending sources. @@ -97,10 +101,6 @@ A CSV with the fields `source_name` and optionally `message_count`. This CSV can A CSV file with the fields `source_name` and `message_count`. This file is not tracked by Git. -## base_reverse_dns_types.txt - -A plaintext list (one per line) of the allowed `type` values. Should match the industry list in this README; used by `sortlists.py` as the authoritative set for validation. - ## psl_overrides.txt A plaintext list of reverse-DNS suffixes used to fold noisy subdomain patterns down to a single base. Each line is a suffix with an optional leading separator: @@ -181,4 +181,4 @@ The output of `detect_rebrands.py`. Tab-separated, one row per flagged map key. ## sortlists.py -Validation and sorting helper invoked as a module. Alphabetically sorts `base_reverse_dns_map.csv` (case-insensitive by first column, preserving CRLF line endings), deduplicates entries, validates that every `type` appears in `base_reverse_dns_types.txt`, and warns on names that contain unescaped commas or stray whitespace. Run it after any batch merge before committing. 
+Validation and sorting helper invoked as a module. Alphabetically sorts `base_reverse_dns_map.csv` (case-insensitive by first column, preserving CRLF line endings), deduplicates entries, validates that every `type` appears in this README's authoritative type list (parsed from the `` / `` block above), and warns on names that contain unescaped commas or stray whitespace. Run it after any batch merge before committing. diff --git a/parsedmarc/resources/maps/base_reverse_dns_types.txt b/parsedmarc/resources/maps/base_reverse_dns_types.txt deleted file mode 100644 index 76c8d7f..0000000 --- a/parsedmarc/resources/maps/base_reverse_dns_types.txt +++ /dev/null @@ -1,46 +0,0 @@ -Agriculture -Automotive -Beauty -Conglomerate -Construction -Consulting -Defense -Education -Email Provider -Email Security -Entertainment -Event Planning -Finance -Food -Government -Government Media -Healthcare -ISP -IaaS -Industrial -Legal -Logistics -MSP -MSSP -Manufacturing -Marketing -News -Nonprofit -PaaS -Photography -Physical Security -Print -Publishing -Real Estate -Religion -Retail -SaaS -Science -Search Engine -Social Media -Sports -Staffing -Technology -Travel -Utilities -Web Host diff --git a/parsedmarc/resources/maps/sortlists.py b/parsedmarc/resources/maps/sortlists.py index fb91ef2..9595b38 100755 --- a/parsedmarc/resources/maps/sortlists.py +++ b/parsedmarc/resources/maps/sortlists.py @@ -4,10 +4,93 @@ from __future__ import annotations import os import csv +import re from pathlib import Path from typing import Mapping, Iterable, Optional, Collection, Union, List, Dict +_TYPES_LIST_RE = re.compile( + r"(.*?)", + re.DOTALL, +) + + +def _parse_types_block(block: str, source: str) -> List[str]: + """Extract type names from the raw text between the marker comments.""" + types: List[str] = [] + for line in block.splitlines(): + stripped = line.strip() + if not stripped: + continue + if not stripped.startswith("- "): + raise ValueError( + f"{source}: unexpected line inside types-list 
block: {line!r}" + ) + types.append(stripped[2:].strip()) + return types + + +def normalize_types_in_readme(readme_path: Union[str, Path]) -> List[str]: + """Validate, normalize, and load the authoritative `type` list from README.md. + + Trims leading/trailing whitespace from each item, deduplicates + case-insensitively (preserving first-seen casing), and sorts the list + case-insensitively. If the on-disk list differs from the normalized + form, the README is rewritten in place. Returns the normalized list. + + Raises ValueError if the markers are missing, the block is empty, a + line doesn't start with `- `, or two entries differ only by casing. + """ + path = Path(readme_path) + text = path.read_text(encoding="utf-8") + m = _TYPES_LIST_RE.search(text) + if not m: + raise ValueError( + f"{path}: missing / markers" + ) + raw_types = _parse_types_block(m.group(1), str(path)) + if not raw_types: + raise ValueError(f"{path}: types-list block is empty") + + seen: Dict[str, str] = {} + for t in raw_types: + key = t.lower() + if key in seen and seen[key] != t: + raise ValueError( + f"{path}: types-list contains case-conflicting entries: " + f"{seen[key]!r} and {t!r}" + ) + seen.setdefault(key, t) + normalized = sorted(seen.values(), key=str.lower) + + if normalized != raw_types: + new_block = "\n".join(f"- {t}" for t in normalized) + replacement = f"\n{new_block}\n" + new_text = text[: m.start()] + replacement + text[m.end() :] + path.write_text(new_text, encoding="utf-8") + return normalized + + +def load_types_from_readme(readme_path: Union[str, Path]) -> List[str]: + """Read the authoritative `type` list out of README.md without rewriting. + + Use `normalize_types_in_readme` to additionally sort, dedupe, and + rewrite the block in place. This thin wrapper is kept for callers + that only want to read the list (e.g. tests, downstream tools). 
+ """ + path = Path(readme_path) + text = path.read_text(encoding="utf-8") + m = _TYPES_LIST_RE.search(text) + if not m: + raise ValueError( + f"{path}: missing / markers" + ) + types = _parse_types_block(m.group(1), str(path)) + if not types: + raise ValueError(f"{path}: types-list block is empty") + return types + + class CSVValidationError(Exception): def __init__(self, errors: list[str]): super().__init__("\n".join(errors)) @@ -153,10 +236,16 @@ def _main(): map_file = "base_reverse_dns_map.csv" map_key = "base_reverse_dns" list_files = ["known_unknown_base_reverse_dns.txt", "psl_overrides.txt"] - types_file = "base_reverse_dns_types.txt" + readme_file = "README.md" - with open(types_file) as f: - types = [line.strip() for line in f if line.strip()] + if not os.path.exists(readme_file): + print(f"Error: {readme_file} does not exist") + exit(1) + try: + types = normalize_types_in_readme(readme_file) + except ValueError as e: + print(f"Error: {e}") + exit(1) map_allowed_values = {"type": types} @@ -165,10 +254,6 @@ def _main(): print(f"Error: {list_file} does not exist") exit(1) sort_list_file(list_file) - if not os.path.exists(types_file): - print(f"Error: {types_file} does not exist") - exit(1) - sort_list_file(types_file, lowercase=False) if not os.path.exists(map_file): print(f"Error: {map_file} does not exist") exit(1) diff --git a/pyproject.toml b/pyproject.toml index 0cb7027..f297060 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,10 +88,11 @@ include = [ [tool.hatch.build] exclude = [ "base_reverse_dns.csv", - "find_bad_utf8.py", - "find_unknown_base_reverse_dns.py", "unknown_base_reverse_dns.csv", - "sortmaps.py", "README.md", - "*.bak" + "*.bak", + # Maintenance tooling: any Python file under parsedmarc/resources/maps/ + # whose name doesn't start with `_` (i.e. everything except __init__.py, + # which must keep shipping for `importlib.resources.files()` lookups). + "parsedmarc/resources/maps/[!_]*.py", ]