From 35dda7c0a623d4cbfe7b8506e938fc8b45f02e0d Mon Sep 17 00:00:00 2001 From: Sean Whalen <44679+seanthegeek@users.noreply.github.com> Date: Thu, 23 Apr 2026 01:37:38 -0400 Subject: [PATCH] Fix map-maintenance tooling and stale classifications (#713) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sortlists.py had three bugs that let bad data through: - The `type` column validator was keyed on "Type" (capital T) but the CSV header is "type" (lowercase), so every row bypassed validation. - `types` was read via `f.readlines()` without stripping, so even if the key had matched, values like `"ISP\n"` would never equal `"ISP"`. - The map was sorted case-sensitively, but README and AGENTS.md both state the map is sorted alphabetically and case-insensitively. Fixing the validator surfaced eight pre-existing rows with invalid or inconsistent `type` values. All are now corrected: - Two types listed in README but missing from base_reverse_dns_types.txt (Religion, Utilities) have been added so the README and authoritative types file agree. - dhl.com, ghm-grenoble.fr, regusnet.com had incorrectly cased `type` values (`logistics`, `healthcare`, `Real estate`), now corrected to match the canonical spellings. - lodestonegroup.com was typed `Insurance`, which is not a listed industry; reclassified as `Finance` (the closest listed category for an insurance brokerage). Also fixes one stale map entry: `rt.ru` was listed as `RT,Government Media`, conflating Rostelecom (the Russian telco that owns and uses rt.ru) with RT / Russia Today (which uses rt.com). Corrected to `Rostelecom,ISP`. Switching to case-insensitive sort moves exactly one row — the sole mixed-case key `United-domains.de` — from near the top of the file (where ASCII ordering placed it before all lowercase keys) into the "united" range where human readers would expect it. 
Co-authored-by: Sean Whalen Co-authored-by: Claude Opus 4.7 (1M context) --- parsedmarc/resources/maps/base_reverse_dns_map.csv | 12 ++++++------ .../resources/maps/base_reverse_dns_types.txt | 2 ++ parsedmarc/resources/maps/sortlists.py | 13 ++++++++----- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/parsedmarc/resources/maps/base_reverse_dns_map.csv b/parsedmarc/resources/maps/base_reverse_dns_map.csv index fa209e3..f772dc0 100644 --- a/parsedmarc/resources/maps/base_reverse_dns_map.csv +++ b/parsedmarc/resources/maps/base_reverse_dns_map.csv @@ -27,7 +27,6 @@ base_reverse_dns,name,type 47.pl,AfterMarket.pl,Web Host 4gbhost.com,4GBHost,Web Host 99cloudhosting.com,99CloudHosting,Web Host -United-domains.de,United Domains,Web Host a-hadar.co.il,Hadar Group,Real Estate a1.bg,A1,ISP a1.hr,A1,ISP @@ -730,7 +729,7 @@ dfn.nl,DELTA Fiber,ISP dgeduf.or.kr,Daegu Dong-gu Education Foundation,Education dgsys.es,DGsys,Web Host dhamma.org,Vipassana Meditation,Nonprofit -dhl.com,DHL,logistics +dhl.com,DHL,Logistics diako-dresden.de,Diakonissenanstalt Dresden,Healthcare diakovere.de,DIAKOVERE,Healthcare dialego.de,Dialego,Marketing @@ -1122,7 +1121,7 @@ getordained.org,Universal Life Church,Religion getresponse.com,GetResponse,Marketing gfiber.com,Google Fiber,ISP ggsv.jp,Techorus,Web Host -ghm-grenoble.fr,Groupe hospitalier mutualiste de Grenoble,healthcare +ghm-grenoble.fr,Groupe hospitalier mutualiste de Grenoble,Healthcare giantpartners.com,Giant Partners,Marketing gigahost.dk,GigaHost,Web Host glasgow-ky.com,Glasgow EPB,ISP @@ -1711,7 +1710,7 @@ loading.es,Loading,Web Host locaweb.com.br,Locaweb,Web Host locus-t.com.my,LOCUS-T,Marketing locustwalk.cc,Locust Walk,Finance -lodestonegroup.com,Loadstone Insurance,Insurance +lodestonegroup.com,Loadstone Insurance,Finance loftsinc.com,"Lofts, Inc.",Retail logika.ro,Logika IT Solutions,MSP logisticintegrators.com,Logistics Integrators,Logistics @@ -2541,7 +2540,7 @@ regionaltelecom.net.br,Regional Telecom,ISP 
register.it,Register.it,Web Host registeredsite.com,Web.com,Web Host registrar-servers.com,Namecheap,Web Host -regusnet.com,Regus,Real estate +regusnet.com,Regus,Real Estate regxa.com,Regxa Cloud,Web Host relativity.one,RelativityOne,SaaS relmax.net,Relmax,Web Host @@ -2583,7 +2582,7 @@ rrtelecomrs.net.br,RR Telecom,ISP rsa.com,RSA,Technology rsgsv.net,Intuit Mailchimp,Marketing rssulnet.com.br,RS Sul Net,ISP -rt.ru,RT,Government Media +rt.ru,Rostelecom,ISP rtctel.com,Ringgold Telephone Company,ISP ruelala.com,Rue La La,Retail runbox.com,Runbox,Email Provider @@ -3245,6 +3244,7 @@ unina.it,University of Naples Federico II,Education uninet-ide.com.mx,Telmex,ISP uninett.no,Uninett,Education unionbank.com.bd,Union Bank,Finance +United-domains.de,United Domains,Web Host unitasglobal.com,Unitas Global,MSP united.net,United Communications,ISP unitedyacht.com,United Yacht Sales,Retail diff --git a/parsedmarc/resources/maps/base_reverse_dns_types.txt b/parsedmarc/resources/maps/base_reverse_dns_types.txt index cbbecb9..76c8d7f 100644 --- a/parsedmarc/resources/maps/base_reverse_dns_types.txt +++ b/parsedmarc/resources/maps/base_reverse_dns_types.txt @@ -32,6 +32,7 @@ Physical Security Print Publishing Real Estate +Religion Retail SaaS Science @@ -41,4 +42,5 @@ Sports Staffing Technology Travel +Utilities Web Host diff --git a/parsedmarc/resources/maps/sortlists.py b/parsedmarc/resources/maps/sortlists.py index e914219..fb91ef2 100755 --- a/parsedmarc/resources/maps/sortlists.py +++ b/parsedmarc/resources/maps/sortlists.py @@ -156,11 +156,9 @@ def _main(): types_file = "base_reverse_dns_types.txt" with open(types_file) as f: - types = f.readlines() - while "" in types: - types.remove("") + types = [line.strip() for line in f if line.strip()] - map_allowed_values = {"Type": types} + map_allowed_values = {"type": types} for list_file in list_files: if not os.path.exists(list_file): @@ -175,7 +173,12 @@ def _main(): print(f"Error: {map_file} does not exist") exit(1) try: - 
sort_csv(map_file, map_key, allowed_values=map_allowed_values) + sort_csv( + map_file, + map_key, + case_insensitive_sort=True, + allowed_values=map_allowed_values, + ) except CSVValidationError as e: print(f"{map_file} did not validate: {e}")