mirror of
https://github.com/domainaware/parsedmarc.git
synced 2026-06-11 13:09:44 +00:00
3839cfff6f
A meaningful share of KU domains return a Cloudflare / DDoS-Guard / "Are you a robot?" / px-captcha interstitial instead of real homepage content — even after the curl-style relaxed-TLS fallback runs. For those rows we have neither homepage signal nor (often) a usable as_name, and they fall through to KU even though the operator is a real (often well-known) business that the classifier could trivially handle if it could just see the page. Added an opt-in `--use-search-fallback` flag that asks DuckDuckGo for `site:<domain>` when the homepage fetch returned a bot-block / parking / empty result, and uses the top result's title and description (only if the result host belongs to the input domain — anti-SEO-spam guard). Mechanism - New optional `ddgs` dependency, listed under the `[build]` extras. `from ddgs import DDGS` is wrapped in a try/except — the script runs without ddgs installed as long as `--use-search-fallback` isn't passed; the flag check exits with a helpful install message otherwise. - `_SEARCH_FALLBACK_TRIGGER_RE` — title/description patterns that look like a bot-block / WAF interstitial / parked / placeholder. Triggers the fallback. Same shape as the classifier's TITLE_NOISE_RE / PARKED_PAGE_RE; the search fallback is the recovery path for exactly the rows that filter excludes. - `_looks_bot_blocked()` — combined check: trigger regex matches OR title and description are both empty (typical of WAF interstitials that strip <title>/<meta> entirely). - `_hosts_match()` — same-domain SEO-spam guard. A search result is accepted only when its host is exactly the input domain or a subdomain of it. Third-party SEO-spam pages that scraped the domain name are silently skipped. - `_search_fallback_fetch()` — runs `site:<domain>` through DDG, walks results in rank order, returns the first one whose host passes the guard. Returns empty if no result matches (caller leaves the row's homepage data alone in that case). 
- `_collect_one()` now takes a `use_search_fallback` flag, calls the fallback after the homepage fetch when the homepage looks bot-blocked, and writes `title_source = "homepage"` or `"search"` so reviewers can audit which rows came from where. - New `title_source` column in the TSV. Smoke test Test set: bbc.com (real homepage, no fallback expected) plus 5 known Cloudflare-walled rows (1800contacts.com, americaneagle.com, broadwaytechnology.com, health.gov.il, mfa.gov.il). Result: bbc.com classified via homepage; the other 5 all recovered title + description via search and got `title_source=search`. The same-domain guard validated independently — for broadwaytechnology.com the guard correctly rejects bloomberg.com and accepts support.broadwaytechnology.com (broadway was acquired by Bloomberg, but the search fallback returns the broadway-domain snippet, not the parent's bloomberg.com product page). Caveats codified in AGENTS.md - Search snippets are still untrusted text (data-not-instructions rule applies the same way it does to homepage HTML). - DDG's index can lag a homepage rebrand by months — when a row classified via `title_source=search` disagrees with a fresh manual fetch, prefer the manual verification. The fallback is a recovery aid, not a tiebreaker against fresh content. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
98 lines
2.3 KiB
TOML
98 lines
2.3 KiB
TOML
# Build backend declaration (PEP 518). This table only accepts `requires`,
# `build-backend`, and `backend-path`; Python version limits belong in
# `requires-python` under [project], not here.
[build-system]
requires = [
    "hatchling>=1.27.0",
]
build-backend = "hatchling.build"

# Core package metadata (PEP 621).
[project]
name = "parsedmarc"
# The version is not written here; see [tool.hatch.version] for its source.
dynamic = ["version"]
description = "A Python package and CLI for parsing aggregate and forensic DMARC reports"
readme = "README.md"
license = "Apache-2.0"
authors = [{ name = "Sean Whalen", email = "whalenster@gmail.com" }]
keywords = ["DMARC", "parser", "reporting"]
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Information Technology",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
]
requires-python = ">=3.10"
# Runtime requirements, one PEP 508 specifier per line.
dependencies = [
    "azure-identity>=1.8.0",
    "azure-monitor-ingestion>=1.0.0",
    "boto3>=1.16.63",
    "dateparser>=1.1.1",
    "dnspython>=2.0.0",
    "elasticsearch-dsl==7.4.0",
    "elasticsearch<7.14.0",
    "expiringdict>=1.1.4",
    "kafka-python-ng>=2.2.2",
    "lxml>=4.4.0",
    "mailsuite[gmail,msgraph]>=2.0.2",
    "maxminddb>=2.0.0",
    "opensearch-py>=2.4.2,<=4.0.0",
    "publicsuffixlist>=0.10.0",
    "pygelf>=0.4.2",
    "requests>=2.22.0",
    "tqdm>=4.31.1",
    "urllib3>=1.25.7",
    "xmltodict>=0.12.0",
    "PyYAML>=6.0.3",
]

# Extras that are not needed at runtime.
[project.optional-dependencies]
build = [
    # `ddgs` serves only the maintainer tooling under
    # parsedmarc/resources/maps/: when `collect_domain_info.py` is run with
    # `--use-search-fallback` and the homepage fetch yields a bot-block,
    # parked, or empty page, the script queries DuckDuckGo instead. The
    # import is optional, so the script still runs without this package
    # provided the fallback flag is not passed.
    "ddgs>=9.0.0",
    "hatch>=1.14.0",
    "myst-parser[linkify]",
    "nose",
    "pytest",
    "pytest-cov",
    "ruff",
    "sphinx",
    "sphinx_rtd_theme",
]

# Console entry point: the `parsedmarc` command maps to `_main` in
# the `parsedmarc.cli` module.
[project.scripts]
parsedmarc = "parsedmarc.cli:_main"

# Links published with the package metadata.
[project.urls]
Homepage = "https://domainaware.github.io/parsedmarc"

# File from which Hatch reads the package version (declared as dynamic
# in [project]).
[tool.hatch.version]
path = "parsedmarc/constants.py"

# Paths selected for inclusion in the source distribution.
[tool.hatch.build.targets.sdist]
include = ["/parsedmarc"]

# File patterns kept out of built artifacts.
[tool.hatch.build]
exclude = [
    "base_reverse_dns.csv",
    "find_bad_utf8.py",
    "find_unknown_base_reverse_dns.py",
    "unknown_base_reverse_dns.csv",
    "sortmaps.py",
    "README.md",
    "*.bak",
]