diff --git a/AGENTS.md b/AGENTS.md index 67617a3..9ff1199 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -232,6 +232,10 @@ When `unknown_base_reverse_dns.csv` has new entries, follow this order rather th - **Same-domain SEO-spam guard.** Top results that point at a *different* host than the input domain are silently skipped. The classifier's data-not-instructions rule still applies — search-engine snippets are untrusted text — but the same-domain check at least guarantees the snippet was published on a page belonging to the operator we're trying to identify, not a parasitic SEO site that scraped the domain name. - **Stale snippets are real.** DuckDuckGo's index can lag a homepage rebrand by months. When you see a row classified via `title_source=search` whose category disagrees with the current homepage you can reach manually, prefer the manual verification — the search snippet is a recovery aid, not a tiebreaker against fresh content. + + **Link-following: when the search snippet is just a hostname pointer.** DDG sometimes returns titles like `Link to fcs.health.gov.il` (literal placeholder for a subdomain it indexed but never snapshotted) or just `yangon.mfa.gov.il` (bare hostname, no other words). Those snippets carry no classifier signal — there's no description of the operator, no industry vocabulary, just the host name. The collector recognizes both patterns (`Link to ` prefix and bare-hostname-only titles) and follows the pointer: it fetches the target hostname directly with `_fetch_homepage`, and if the fetch returns real (non-bot-blocked) content, replaces the row's title and description with that content. The link target is recorded in a `link_target_domain` column. `title_source` is set to `search→` to make the path auditable. + + When `link_target_domain` is set on a row that classifies, `classify_unknown_domains.py` emits **two** map rows under the same `(name, type)` — the original input *and* the target — so both keys can be looked up. The original input is the "og" domain; the target is what the search engine led us to. Both belong in the map: the same operator may show up in DMARC reports under either base. - `classify_unknown_domains.py` — regex-based multilingual classifier that consumes a `collect_domain_info.py` TSV and emits map / ambiguous / known-unknown additions. Useful for both lookup paths into `base_reverse_dns_map.csv`: the original PTR-side flow (classifying reverse-DNS base domains discovered from DMARC report source IPs) and the MMDB-coverage flow (classifying ASN domains lifted from the bundled IPinfo Lite MMDB). Detectors cover all 44 industry types in the README, and every detector aims for **concept parity across the same broad language pool** — see the concept-parity rule below. The classifier is the regex baseline of step 4 of the unknown-domain workflow (see "Workflow for classifying unknown domains" above) — it catches the obvious cases at scale and leaves the genuinely ambiguous to manual / LLM review. **Three output buckets**. Per-row, the classifier returns one of three states: diff --git a/parsedmarc/resources/maps/classify_unknown_domains.py b/parsedmarc/resources/maps/classify_unknown_domains.py index 87cecfc..f97ccd7 100644 --- a/parsedmarc/resources/maps/classify_unknown_domains.py +++ b/parsedmarc/resources/maps/classify_unknown_domains.py @@ -9775,6 +9775,16 @@ def classify_tsv(input_path: str, mmdb_path: str) -> tuple: hand += 1 continue r = auto_classify(row, domain, as_name) + # When `collect_domain_info.py --use-search-fallback` followed + # a "Link to " / bare-hostname search snippet to a + # different host, that host is recorded in `link_target_domain`. + # The classifier emits two map rows (input + target) under the + # same `(name, type)` so both keys can be looked up. The user + # who introduced this calls the original input the "og domain" + # and the target the operator's actual content host — both + # belong in the map. Skipped when the target matches the input + # exactly (no new information) or the row didn't classify. + link_target = (row.get("link_target_domain") or "").strip().lower() if r is None: ku.append(domain) elif r == ("DROP", None): @@ -9785,11 +9795,19 @@ def classify_tsv(input_path: str, mmdb_path: str) -> tuple: elif len(r) == 2: adds.append((domain, r[0], r[1])) auto += 1 + if link_target and link_target != domain: + adds.append((link_target, r[0], r[1])) + auto += 1 else: # (brand, primary, alternatives) — multi-category match. title = (row.get("title") or "").strip() ambiguous.append((domain, r[0], r[1], r[2], title)) ambig += 1 + # Surface the target alongside the input so the human + # reviewer can adjudicate both with one decision. + if link_target and link_target != domain: + ambiguous.append((link_target, r[0], r[1], r[2], title)) + ambig += 1 return ( adds, ambiguous, diff --git a/parsedmarc/resources/maps/collect_domain_info.py b/parsedmarc/resources/maps/collect_domain_info.py index 6594484..869ce73 100644 --- a/parsedmarc/resources/maps/collect_domain_info.py +++ b/parsedmarc/resources/maps/collect_domain_info.py @@ -79,6 +79,7 @@ FIELDS = [ "ip_whois_country", "error", "title_source", + "link_target_domain", ] USER_AGENT = ( @@ -907,6 +908,44 @@ def _search_fallback_fetch(domain: str, max_results: int = 5) -> dict: return out +# When a DDG search result's title is just a hostname pointer — either +# the literal "Link to " snippet DDG sometimes emits for +# subdomains it has indexed, or a bare hostname with no other words — +# the title has no classifier signal. The right move is to follow the +# pointer: fetch the target hostname directly and use *its* content. +# These two regexes recognize the patterns. +_LINK_TO_TITLE_RE = re.compile(r"^link to\s+(\S+?)\s*$", re.IGNORECASE) +_BARE_HOSTNAME_RE = re.compile( + r"^([a-z0-9](?:[a-z0-9-]*[a-z0-9])?(?:\.[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)+)\.?$", + re.IGNORECASE, +) + + +def _extract_link_target(title: str) -> str: + """Return target hostname when the title is just a link/domain pointer. + + Two patterns: + - "Link to " — DDG's literal snippet for some subdomain + results, e.g. "Link to fcs.health.gov.il". + - Just a hostname — the entire title *is* the hostname, e.g. + "yangon.mfa.gov.il". + + Returns "" when neither pattern matches (the search snippet has + real classifier-relevant content and we should use it as-is). + """ + title = (title or "").strip() + if not title: + return "" + m = _LINK_TO_TITLE_RE.match(title) + if m: + candidate = m.group(1).rstrip(".") + if _BARE_HOSTNAME_RE.match(candidate): + return candidate.lower() + if _BARE_HOSTNAME_RE.match(title): + return title.rstrip(".").lower() + return "" + + def _looks_bot_blocked(meta: dict) -> bool: """Decide whether a homepage-fetch result warrants a search-fallback. @@ -954,6 +993,29 @@ def _collect_one( if not row.get("final_url"): row["final_url"] = sf["final_url"] row["title_source"] = "search" + # Link-following: if the search snippet is just a hostname pointer + # ("Link to fcs.health.gov.il" or bare "yangon.mfa.gov.il") it + # carries no classifier signal — the snippet is DDG's placeholder + # for a subdomain it indexed but didn't fully snapshot. Fetch the + # target hostname directly and replace title/desc with its real + # content. The link target is recorded in `link_target_domain` so + # downstream tooling can emit alias map rows when the target is on + # a different registrable domain than the input. + target = _extract_link_target(row.get("title", "")) + if target and target != domain: + row["link_target_domain"] = target + target_meta = _fetch_homepage(target, http_timeout) + if ( + target_meta.get("title") or target_meta.get("description") + ) and not _looks_bot_blocked(target_meta): + row["title"] = target_meta["title"] + row["description"] = target_meta["description"] + row["rebrand_signal"] = target_meta.get("rebrand_signal", "") + row["external_links"] = target_meta.get("external_links", "") + row["final_url"] = target_meta.get("final_url") or row.get( + "final_url", "" + ) + row["title_source"] = f"search→{target}" ips = _resolve_ips(domain) row["ips"] = ",".join(ips[:4]) # WHOIS the first resolved IP — usually reveals the hosting ASN / provider,