mirror of
https://github.com/domainaware/parsedmarc.git
synced 2026-04-24 14:29:27 +00:00
Fix map-maintenance tooling and stale classifications (#713)
sortlists.py had three bugs that let bad data through:

- The `type` column validator was keyed on "Type" (capital T) but the CSV header is "type" (lowercase), so every row bypassed validation.
- `types` was read via `f.readlines()` without stripping, so even if the key had matched, values like `"ISP\n"` would never equal `"ISP"`.
- The map was sorted case-sensitively, but README and AGENTS.md both state the map is sorted alphabetically, case-insensitively.

Fixing the validator surfaced eight pre-existing rows with invalid or inconsistent `type` values. All are now corrected:

- Two types listed in README but missing from base_reverse_dns_types.txt (Religion, Utilities) have been added so the README and authoritative types file agree.
- dhl.com, ghm-grenoble.fr, regusnet.com had incorrectly cased type values (`logistics`, `healthcare`, `Real estate`) corrected to match the canonical spellings.
- lodestonegroup.com was typed `Insurance`, which is not a listed industry; reclassified as `Finance` (the closest listed category for an insurance brokerage).

Also fixes one stale map entry: `rt.ru` was listed as `RT,Government Media`, conflating Rostelecom (the Russian telco that owns and uses rt.ru) with RT / Russia Today (which uses rt.com). Corrected to `Rostelecom,ISP`.

Switching to case-insensitive sort moves exactly one row — the sole mixed-case key `United-domains.de` — from the top of the file (where ASCII ordering placed it before all lowercase keys) into the "united" range where human readers would expect it.

Co-authored-by: Sean Whalen <seanthegeek@users.noreply.github.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -27,7 +27,6 @@ base_reverse_dns,name,type
|
||||
47.pl,AfterMarket.pl,Web Host
|
||||
4gbhost.com,4GBHost,Web Host
|
||||
99cloudhosting.com,99CloudHosting,Web Host
|
||||
United-domains.de,United Domains,Web Host
|
||||
a-hadar.co.il,Hadar Group,Real Estate
|
||||
a1.bg,A1,ISP
|
||||
a1.hr,A1,ISP
|
||||
@@ -730,7 +729,7 @@ dfn.nl,DELTA Fiber,ISP
|
||||
dgeduf.or.kr,Daegu Dong-gu Education Foundation,Education
|
||||
dgsys.es,DGsys,Web Host
|
||||
dhamma.org,Vipassana Meditation,Nonprofit
|
||||
dhl.com,DHL,logistics
|
||||
dhl.com,DHL,Logistics
|
||||
diako-dresden.de,Diakonissenanstalt Dresden,Healthcare
|
||||
diakovere.de,DIAKOVERE,Healthcare
|
||||
dialego.de,Dialego,Marketing
|
||||
@@ -1122,7 +1121,7 @@ getordained.org,Universal Life Church,Religion
|
||||
getresponse.com,GetResponse,Marketing
|
||||
gfiber.com,Google Fiber,ISP
|
||||
ggsv.jp,Techorus,Web Host
|
||||
ghm-grenoble.fr,Groupe hospitalier mutualiste de Grenoble,healthcare
|
||||
ghm-grenoble.fr,Groupe hospitalier mutualiste de Grenoble,Healthcare
|
||||
giantpartners.com,Giant Partners,Marketing
|
||||
gigahost.dk,GigaHost,Web Host
|
||||
glasgow-ky.com,Glasgow EPB,ISP
|
||||
@@ -1711,7 +1710,7 @@ loading.es,Loading,Web Host
|
||||
locaweb.com.br,Locaweb,Web Host
|
||||
locus-t.com.my,LOCUS-T,Marketing
|
||||
locustwalk.cc,Locust Walk,Finance
|
||||
lodestonegroup.com,Loadstone Insurance,Insurance
|
||||
lodestonegroup.com,Loadstone Insurance,Finance
|
||||
loftsinc.com,"Lofts, Inc.",Retail
|
||||
logika.ro,Logika IT Solutions,MSP
|
||||
logisticintegrators.com,Logistics Integrators,Logistics
|
||||
@@ -2541,7 +2540,7 @@ regionaltelecom.net.br,Regional Telecom,ISP
|
||||
register.it,Register.it,Web Host
|
||||
registeredsite.com,Web.com,Web Host
|
||||
registrar-servers.com,Namecheap,Web Host
|
||||
regusnet.com,Regus,Real estate
|
||||
regusnet.com,Regus,Real Estate
|
||||
regxa.com,Regxa Cloud,Web Host
|
||||
relativity.one,RelativityOne,SaaS
|
||||
relmax.net,Relmax,Web Host
|
||||
@@ -2583,7 +2582,7 @@ rrtelecomrs.net.br,RR Telecom,ISP
|
||||
rsa.com,RSA,Technology
|
||||
rsgsv.net,Intuit Mailchimp,Marketing
|
||||
rssulnet.com.br,RS Sul Net,ISP
|
||||
rt.ru,RT,Government Media
|
||||
rt.ru,Rostelecom,ISP
|
||||
rtctel.com,Ringgold Telephone Company,ISP
|
||||
ruelala.com,Rue La La,Retail
|
||||
runbox.com,Runbox,Email Provider
|
||||
@@ -3245,6 +3244,7 @@ unina.it,University of Naples Federico II,Education
|
||||
uninet-ide.com.mx,Telmex,ISP
|
||||
uninett.no,Uninett,Education
|
||||
unionbank.com.bd,Union Bank,Finance
|
||||
United-domains.de,United Domains,Web Host
|
||||
unitasglobal.com,Unitas Global,MSP
|
||||
united.net,United Communications,ISP
|
||||
unitedyacht.com,United Yacht Sales,Retail
|
||||
|
||||
|
@@ -32,6 +32,7 @@ Physical Security
|
||||
Print
|
||||
Publishing
|
||||
Real Estate
|
||||
Religion
|
||||
Retail
|
||||
SaaS
|
||||
Science
|
||||
@@ -41,4 +42,5 @@ Sports
|
||||
Staffing
|
||||
Technology
|
||||
Travel
|
||||
Utilities
|
||||
Web Host
|
||||
|
||||
@@ -156,11 +156,9 @@ def _main():
|
||||
types_file = "base_reverse_dns_types.txt"
|
||||
|
||||
with open(types_file) as f:
|
||||
types = f.readlines()
|
||||
while "" in types:
|
||||
types.remove("")
|
||||
types = [line.strip() for line in f if line.strip()]
|
||||
|
||||
map_allowed_values = {"Type": types}
|
||||
map_allowed_values = {"type": types}
|
||||
|
||||
for list_file in list_files:
|
||||
if not os.path.exists(list_file):
|
||||
@@ -175,7 +173,12 @@ def _main():
|
||||
print(f"Error: {map_file} does not exist")
|
||||
exit(1)
|
||||
try:
|
||||
sort_csv(map_file, map_key, allowed_values=map_allowed_values)
|
||||
sort_csv(
|
||||
map_file,
|
||||
map_key,
|
||||
case_insensitive_sort=True,
|
||||
allowed_values=map_allowed_values,
|
||||
)
|
||||
except CSVValidationError as e:
|
||||
print(f"{map_file} did not validate: {e}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user