Improve list verification

2026-05-20 10:55:24 +00:00 · 2025-08-19 20:02:55 -04:00
parent 5df152d469
commit 4bbd97dbaa
10 changed files with 2119 additions and 1863 deletions
@@ -106,7 +106,7 @@ ENV/
 .idea/

 # VS Code launch config
-.vscode/launch.json
+#.vscode/launch.json

 # Visual Studio Code settings
 #.vscode/
@@ -0,0 +1,31 @@
+{
+  // Use IntelliSense to learn about possible attributes.
+  // Hover to view descriptions of existing attributes.
+  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+  "version": "0.2.0",
+  "configurations": [
+    {
+      "name": "Python Debugger: Current File",
+      "type": "debugpy",
+      "request": "launch",
+      "program": "${file}",
+      "console": "integratedTerminal"
+    },
+    {
+      "name": "sortlists.py",
+      "type": "debugpy",
+      "request": "launch",
+      "program": "sortlists.py",
+      "cwd": "${workspaceFolder}/parsedmarc/resources/maps",
+      "console": "integratedTerminal"
+    },
+    {
+      "name": "find_unknown_base_reverse_dns.py",
+      "type": "debugpy",
+      "request": "launch",
+      "program": "find_unknown_base_reverse_dns.py",
+      "cwd": "${workspaceFolder}/parsedmarc/resources/maps",
+      "console": "integratedTerminal"
+    }
+  ]
+}
@@ -109,6 +109,7 @@
        "setuptools",
        "smartquotes",
        "SMTPTLS",
+        "sortlists",
        "sortmaps",
        "sourcetype",
        "STARTTLS",
@@ -19,7 +19,7 @@ if [  -d "./../parsedmarc-docs" ]; then
 fi
 cd ..
 cd parsedmarc/resources/maps
-python3 sortmaps.py
+python3 sortlists.py
 echo "Checking for invalid UTF-8 bytes in base_reverse_dns_map.csv"
 python3 find_bad_utf8.py base_reverse_dns_map.csv
 cd ../../..
@@ -0,0 +1,44 @@
+Agriculture
+Automotive
+Beauty
+Conglomerate
+Construction
+Consulting
+Defense
+Education
+Email Provider
+Email Security
+Entertainment
+Event Planning
+Finance
+Food
+Government
+Government Media
+Healthcare
+ISP
+IaaS
+Industrial
+Legal
+Logistics
+MSP
+MSSP
+Manufacturing
+Marketing
+News
+Nonprofit
+PaaS
+Photography
+Physical Security
+Print
+Publishing
+Real Estate
+Retail
+SaaS
+Science
+Search Engine
+Social Media
+Sports
+Staffing
+Technology
+Travel
+Web Host
@@ -15,22 +15,14 @@ def _main():

    output_rows = []

-    for p in [
-        input_csv_file_path,
-        base_reverse_dns_map_file_path,
-        known_unknown_list_file_path,
-        psl_overrides_file_path,
-    ]:
-        if not os.path.exists(p):
-            print(f"Error: {p} does not exist")
-            exit(1)
-
    known_unknown_domains = []
    psl_overrides = []
    known_domains = []
    output_rows = []

    def load_list(file_path, list_var):
+        if not os.path.exists(file_path):
+            print(f"Error: {file_path} does not exist")
        print(f"Loading {file_path}")
        list_var = []
        with open(file_path) as f:
@@ -44,9 +36,11 @@ def _main():

    load_list(known_unknown_list_file_path, known_unknown_domains)
    load_list(psl_overrides_file_path, psl_overrides)
-
-    print(f"Checking domains against {base_reverse_dns_map_file_path}")
+    if not os.path.exists(input_csv_file_path):
+        print(f"Error: {input_csv_file_path} does not exist")
+        exit(1)
    with open(input_csv_file_path) as f:
+        print(f"Checking domains against {base_reverse_dns_map_file_path}")
        for row in csv.DictReader(f):
            domain = row["source_name"].lower().strip()
            if domain == "":
@@ -1,10 +1,12 @@
 185.in-addr.arpa
 190.in-addr.arpa
 200.in-addr.arpa
+444qcuhilla.com
 9services.com
 a7e.ru
 a94434500-blog.com
 abv-10.top
+adcritic.net
 adlucrumnewsletter.com
 admin.corpivensa.gob.ve
 advantageiq.com
@@ -15,6 +17,9 @@ aghories.com
 ai270.net
 albagroup-eg.com
 alchemy.net
+alohabeachcamp.net
+aluminumpipetubing.com
+americanstorageca.com
 anchorfundhub.com
 anglishment.com
 anteldata.net.uy
@@ -32,6 +37,7 @@ aurelienvos.com
 automatech.lat
 avistaadvantage.com
 b8sales.com
+banaras.co
 bearandbullmarketnews.com
 bestinvestingtime.com
 biocorp.com
@@ -42,14 +48,17 @@ bluhosting.com
 bodiax.pp.ua
 bost-law.com
 brainity.com
+brazalnde.net
 brnonet.cz
 brushinglegal.de
 brw.net
 budgeteasehub.com
 buoytoys.com
+c53dw7m24rj.com
 cashflowmasterypro.com
 cavabeen.com
 cbti.net
+chauffeurplan.co.uk
 checkpox.fun
 chegouseuvlache.org
 christus.mx
@@ -63,12 +72,16 @@ cnode.io
 code-it.net
 colombiaceropapel.org
 commerceinsurance.com
+comsharempc.com
 coolblaze.com
 coowo.com
 corpemail.net
 cp2-myorderbox.com
 cps.com.ar
 ctla.co.kr
+cumbalikonakhotel.com
+currencyexconverter.com
+daakbabu.com
 dastans.ru
 datahost36.de
 descarca-counter-strike.net
@@ -78,6 +91,7 @@ dinofelis.cn
 diwkyncbi.top
 dkginternet.com
 dns-oid.com
+domainserver.ne.jp
 domconfig.com
 doorsrv.com
 dreampox.fun
@@ -86,6 +100,7 @@ ds.network
 dvj.theworkpc.com
 dwlcka.com
 dyntcorp.com
+easternkingspei.com
 economiceagles.com
 egosimail.com
 emailgids.net
@@ -96,7 +111,9 @@ erestaff.com
 example.com
 exposervers.com-new
 eyecandyhosting.xyz
+feipnghardware.com
 fetscorp.shop
+fewo-usedom.net
 fin-crime.com
 financeaimpoint.com
 financeupward.com
@@ -104,19 +121,27 @@ flex-video.bnr.la
 formicidaehunt.net
 fosterheap.com
 frontiernet.net
+ftifb7tk3c.com
 gendns.com
+getgreencardsfast.com
 getthatroi.com
 gigidea.net
 giize.com
 ginous.eu.com
 gist-th.com
+goldsboroughplace.com
 gophermedia.com
 gqlists.us.com
 gratzl.de
 greatestworldnews.com
 greennutritioncare.com
 h-serv.co.uk
+haedefpartners.com
+halcyon-aboveboard.com
+hanzubon.org
 hgnbroken.us.com
+hopsinthehanger.com
+hostelsucre.com
 hosting1337.com
 hostinglotus.cloud
 hostingmichigan.com
@@ -137,20 +162,25 @@ idealconcept.live
 igppevents.org.uk
 imjtmn.cn
 immenzaces.com
+indulgent-holistic.com
 inshaaegypt.com
 ip-147-135-108.us
 ip-178-33-109.eu
 ip-ptr.tech
+iswhatpercent.com
 itsidc.com
 itwebs.com
 ivol.co
 jalanet.co.id
 jimishare.com
+jlenterprises.co.uk
+joyomokei.com
 jumanra.org
 kahlaa.com
 kbronet.com.tw
 kdnursing.org
 kihy.theworkpc.com
+kingschurchwirral.org
 kitchenaildbd.com
 layerdns.cloud
 legenditds.com
@@ -159,6 +189,7 @@ listertermoformadoa.com
 llsend.com
 lohkal.com
 lonestarmm.net
+longmarquis.com
 longwoodmgmt.com
 lwl-puehringer.at
 lynx.net.lb
@@ -173,7 +204,10 @@ matroguel.cam
 maximpactipo.com
 mechanicalwalk.store
 mediavobis.com
+mindworksunlimited.com
+mirth-gale.com
 misorpresa.com
+mjinn.com
 moderntradingnews.com
 moonjaws.com
 morningnewscatcher.com
@@ -182,8 +216,10 @@ mschosting.com
 msdp1.com
 mspnet.pro
 mts-nn.ru
+multifamilydesign.com
 mxserver.ro
 mxthunder.net
+my-ihor.ru
 myrewards.net
 mysagestore.com
 mysecurewebserver.com
@@ -202,67 +238,84 @@ newwallstreetcode.com
 ngvcv.cn
 nic.name
 nidix.net
+nieuwedagnetwerk.net
 nlscanme.com
 nmeuh.cn
 noisndametal.com
 nwo.giize.com
+nwwhalewatchers.org
 offerslatedeals.com
 office365.us
 ogicom.net
+olivettilexikon.co.uk
 omegabrasil.inf.br
 onnet21.com
 oppt-ac.fit
 orbitel.net.co
+outsidences.com
 ovaltinalization.co
 overta.ru
+ox28vgrurc.com
 panaltyspot.space
 passionatesmiles.com
+paulinelam.com
 perimetercenter.net
 permanentscreen.com
 phdns3.es
+pigelixval1.com
 planethoster.net
 plesk.page
 pmnhost.net
+pokiloandhu.net
 pokupki5.ru
 popiup.com
 ports.net
 prima.com.ar
 prima.net.ar
+profsol.co.uk
 prohealthmotion.com
 proudserver.com
 psnm.ru
 pvcwindowsprices.live
 qontenciplc.autos
 quatthonggiotico.com
+qxyxab44njd.com
 rapidns.com
 raxa.host
 reliablepanel.com
 rgb365.eu
 riddlecamera.net
+riddletrends.com
+runnin-rebels.com
 rwdhosting.ca
 s500host.com
 sahacker-2020.com
 samsales.site
 saransk.ru
 satirogluet.com
+scioncontacts.com
 seaspraymta3.net
 secorp.mx
 securen.net
 securerelay.in
 securev.net
 servershost.biz
+shopfox.ca
 silvestrejaguar.sbs
 silvestreonca.sbs
+simplediagnostics.org
 siriuscloud.jp
 sisglobalresearch.com
 smallvillages.com
 smartape-vps.com
 solusoftware.com
+southcoastwebhosting12.com
 spiritualtechnologies.io
 sprout.org
 stableserver.net
 stockepictigers.com
 stockexchangejournal.com
+subterranean-concave.com
 suksangroup.com
 sysop4.com
 system.eu.com
@@ -276,8 +329,10 @@ thaicloudsolutions.com
 thaimonster.com
 thepushcase.com
 totaal.net
+tqh.ro
 traderlearningcenter.com
 tullostrucking.com
+turbinetrends.com
 ultragate.com
 unite.services
 urawasl.com
@@ -294,8 +349,10 @@ web-login.eu
 weblinkinternational.com
 webnox.io
 welllivinghive.com
+whoflew.com
 wisdomhard.com
 wisewealthcircle.com
+wodeniowa.com
 wsiph2.com
 xnt.mx
 xpnuf.cn
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import os
+import csv
+from pathlib import Path
+from typing import Mapping, Iterable, Optional, Collection, Union, List, Dict
+
+
+class CSVValidationError(Exception):
+    def __init__(self, errors: list[str]):
+        super().__init__("\n".join(errors))
+        self.errors = errors
+
+
+def sort_csv(
+    filepath: Union[str, Path],
+    field: str,
+    *,
+    sort_field_value_must_be_unique: bool = True,
+    strip_whitespace: bool = True,
+    fields_to_lowercase: Optional[Iterable[str]] = None,
+    case_insensitive_sort: bool = False,
+    required_fields: Optional[Iterable[str]] = None,
+    allowed_values: Optional[Mapping[str, Collection[str]]] = None,
+) -> List[Dict[str, str]]:
+    """
+    Read a CSV, optionally normalize rows (strip whitespace, lowercase certain fields),
+    validate field values, and write the sorted CSV back to the same path.
+
+    - filepath: Path to the CSV to sort.
+    - field: The field name to sort by.
+    - fields_to_lowercase: Permanently lowercases these field(s) in the data.
+    - strip_whitespace: Remove all whitespace at the beginning and of field values.
+    - case_insensitive_sort: Ignore case when sorting without changing values.
+    - required_fields: A list of fields that must have data in all rows.
+    - allowed_values: A mapping of allowed values for fields.
+    """
+    path = Path(filepath)
+    required_fields = set(required_fields or [])
+    lower_set = set(fields_to_lowercase or [])
+    allowed_sets = {k: set(v) for k, v in (allowed_values or {}).items()}
+    if sort_field_value_must_be_unique:
+        seen_sort_field_values = []
+
+    with path.open("r", newline="") as infile:
+        reader = csv.DictReader(infile)
+        fieldnames = reader.fieldnames or []
+        if field not in fieldnames:
+            raise CSVValidationError([f"Missing sort column: {field!r}"])
+        missing_headers = required_fields - set(fieldnames)
+        if missing_headers:
+            raise CSVValidationError(
+                [f"Missing required header(s): {sorted(missing_headers)}"]
+            )
+        rows = list(reader)
+
+    def normalize_row(row: Dict[str, str]) -> None:
+        if strip_whitespace:
+            for k, v in row.items():
+                if isinstance(v, str):
+                    row[k] = v.strip()
+        for fld in lower_set:
+            if fld in row and isinstance(row[fld], str):
+                row[fld] = row[fld].lower()
+
+    def validate_row(
+        row: Dict[str, str], sort_field: str, line_no: int, errors: list[str]
+    ) -> None:
+        if sort_field_value_must_be_unique:
+            if row[sort_field] in seen_sort_field_values:
+                errors.append(f"Line {line_no}: Duplicate row for '{sort_field}'")
+            else:
+                seen_sort_field_values.append(sort_field)
+        for rf in required_fields:
+            val = row.get(rf)
+            if val is None or val == "":
+                errors.append(
+                    f"Line {line_no}: Missing value for required field '{rf}'"
+                )
+        for field, allowed_values in allowed_sets.items():
+            if field in row:
+                val = row[field]
+                if val not in allowed_values:
+                    errors.append(
+                        f"Line {line_no}: '{val}' is not an allowed value for '{field}' "
+                        f"(allowed: {sorted(allowed_values)})"
+                    )
+
+    errors: list[str] = []
+    for idx, row in enumerate(rows, start=2):  # header is line 1
+        normalize_row(row)
+        validate_row(row, field, idx, errors)
+
+    if errors:
+        raise CSVValidationError(errors)
+
+    def sort_key(r: Dict[str, str]):
+        v = r.get(field, "")
+        if isinstance(v, str) and case_insensitive_sort:
+            return v.casefold()
+        return v
+
+    rows.sort(key=sort_key)
+
+    with path.open(filepath, "w", newline="") as outfile:
+        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(rows)
+
+
+def sort_list_file(
+    filepath: Union[str, Path],
+    *,
+    lowercase: bool = True,
+    strip: bool = True,
+    deduplicate: bool = True,
+    remove_blank_lines: bool = True,
+    ending_newline: bool = True,
+    newline: Optional[str] = "\n",
+):
+    """Read a list from a file, sort it, optionally strip and deduplicate the values,
+    then write that list back to the file.
+
+    - Filepath: The path to the file.
+    - lowercase: Lowercase all values prior to sorting.
+    - remove_blank_lines: Remove any plank lines.
+    - ending_newline: End the file with a newline, even if remove_blank_lines is true.
+    - newline: The newline character to use.
+    """
+    with open(filepath, mode="r", newline=newline) as infile:
+        lines = infile.readlines()
+        for i in range(len(lines)):
+            if lowercase:
+                lines[i] = lines[i].lower()
+            if strip:
+                lines[i] = lines[i].strip()
+        if deduplicate:
+            lines = list(set(lines))
+        if remove_blank_lines:
+            while "" in lines:
+                lines.remove("")
+        lines = sorted(lines)
+        if ending_newline:
+            if lines[-1] != "":
+                lines.append("")
+    with open(filepath, mode="w", newline=newline) as outfile:
+        outfile.write("\n".join(lines))
+
+
+def _main():
+    map_file = "base_reverse_dns_map.csv"
+    map_key = "base_reverse_dns"
+    list_files = ["known_unknown_base_reverse_dns.txt", "psl_overrides.txt"]
+    types_file = "base_reverse_dns_types.txt"
+
+    with open(types_file) as f:
+        types = f.readlines()
+        while "" in types:
+            types.remove("")
+
+    map_allowed_values = {"Type": types}
+
+    for list_file in list_files:
+        if not os.path.exists(list_file):
+            print(f"Error: {list_file} does not exist")
+            exit(1)
+        sort_list_file(list_file)
+    if not os.path.exists(types_file):
+        print(f"Error: {types_file} does not exist")
+        exit(1)
+    sort_list_file(types_file, lowercase=False)
+    if not os.path.exists(map_file):
+        print(f"Error: {map_file} does not exist")
+        exit(1)
+    try:
+        sort_csv(map_file, map_key, allowed_values=map_allowed_values)
+    except CSVValidationError as e:
+        print(f"{map_file} did not validate: {e}")
+
+
+if __name__ == "__main__":
+    _main()
@@ -1,69 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import csv
-
-maps_dir = os.path.join(".")
-map_files = ["base_reverse_dns_map.csv"]
-list_files = ["known_unknown_base_reverse_dns.txt", "psl_overrides.txt"]
-
-
-def sort_csv(filepath, column=0, strip_whitespace=True):
-    with open(filepath, mode="r", newline="\n") as infile:
-        reader = csv.reader(infile)
-        header = next(reader)
-
-        def normalize_row(row):
-            return [
-                field.strip() if strip_whitespace and isinstance(field, str) else field
-                for field in row
-            ]
-
-        rows = [normalize_row(row) for row in reader]
-        sorted_rows = sorted(rows, key=lambda row: row[column])
-        existing_values = []
-        for row in sorted_rows:
-            if row[column] in existing_values:
-                print(f"Warning: {row[column]} is in {filepath} multiple times")
-            else:
-                existing_values.append(row[column])
-
-    with open(filepath, mode="w", newline="\n") as outfile:
-        writer = csv.writer(outfile, lineterminator="\n")
-        writer.writerow(header)
-        writer.writerows(sorted_rows)
-
-
-def sort_list_file(
-    filepath,
-    lowercase=True,
-    strip=True,
-    deduplicate=True,
-    remove_blank_lines=True,
-    ending_newline=True,
-    newline="\n",
-):
-    with open(filepath, mode="r", newline=newline) as infile:
-        lines = infile.readlines()
-        for i in range(len(lines)):
-            if lowercase:
-                lines[i] = lines[i].lower()
-            if strip:
-                lines[i] = lines[i].strip()
-        if deduplicate:
-            lines = list(set(lines))
-        if remove_blank_lines:
-            while "" in lines:
-                lines.remove("")
-        lines = sorted(lines)
-        if ending_newline:
-            if lines[-1] != "":
-                lines.append("")
-    with open(filepath, mode="w", newline=newline) as outfile:
-        outfile.write("\n".join(lines))
-
-
-for csv_file in map_files:
-    sort_csv(os.path.join(maps_dir, csv_file))
-for list_file in list_files:
-    sort_list_file(os.path.join(maps_dir, list_file))