Improve list verification

This commit is contained in:
Sean Whalen
2025-08-19 20:02:55 -04:00
parent 5df152d469
commit 4bbd97dbaa
10 changed files with 2119 additions and 1863 deletions

2
.gitignore vendored
View File

@@ -106,7 +106,7 @@ ENV/
.idea/
# VS Code launch config
.vscode/launch.json
#.vscode/launch.json
# Visual Studio Code settings
#.vscode/

31
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,31 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
},
{
"name": "sortlists.py",
"type": "debugpy",
"request": "launch",
"program": "sortlists.py",
"cwd": "${workspaceFolder}/parsedmarc/resources/maps",
"console": "integratedTerminal"
},
{
"name": "find_unknown_base_reverse_dns.py",
"type": "debugpy",
"request": "launch",
"program": "find_unknown_base_reverse_dns.py",
"cwd": "${workspaceFolder}/parsedmarc/resources/maps",
"console": "integratedTerminal"
}
]
}

View File

@@ -109,6 +109,7 @@
"setuptools",
"smartquotes",
"SMTPTLS",
"sortlists",
"sortmaps",
"sourcetype",
"STARTTLS",

View File

@@ -19,7 +19,7 @@ if [ -d "./../parsedmarc-docs" ]; then
fi
cd ..
cd parsedmarc/resources/maps
python3 sortmaps.py
python3 sortlists.py
echo "Checking for invalid UTF-8 bytes in base_reverse_dns_map.csv"
python3 find_bad_utf8.py base_reverse_dns_map.csv
cd ../../..

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,44 @@
Agriculture
Automotive
Beauty
Conglomerate
Construction
Consulting
Defense
Education
Email Provider
Email Security
Entertainment
Event Planning
Finance
Food
Government
Government Media
Healthcare
ISP
IaaS
Industrial
Legal
Logistics
MSP
MSSP
Manufacturing
Marketing
News
Nonprofit
PaaS
Photography
Physical Security
Print
Publishing
Real Estate
Retail
SaaS
Science
Search Engine
Social Media
Sports
Staffing
Technology
Travel
Web Host

View File

@@ -15,22 +15,14 @@ def _main():
output_rows = []
for p in [
input_csv_file_path,
base_reverse_dns_map_file_path,
known_unknown_list_file_path,
psl_overrides_file_path,
]:
if not os.path.exists(p):
print(f"Error: {p} does not exist")
exit(1)
known_unknown_domains = []
psl_overrides = []
known_domains = []
output_rows = []
def load_list(file_path, list_var):
if not os.path.exists(file_path):
print(f"Error: {file_path} does not exist")
print(f"Loading {file_path}")
list_var = []
with open(file_path) as f:
@@ -44,9 +36,11 @@ def _main():
load_list(known_unknown_list_file_path, known_unknown_domains)
load_list(psl_overrides_file_path, psl_overrides)
print(f"Checking domains against {base_reverse_dns_map_file_path}")
if not os.path.exists(input_csv_file_path):
print(f"Error: {input_csv_file_path} does not exist")
exit(1)
with open(input_csv_file_path) as f:
print(f"Checking domains against {base_reverse_dns_map_file_path}")
for row in csv.DictReader(f):
domain = row["source_name"].lower().strip()
if domain == "":

View File

@@ -1,10 +1,12 @@
185.in-addr.arpa
190.in-addr.arpa
200.in-addr.arpa
444qcuhilla.com
9services.com
a7e.ru
a94434500-blog.com
abv-10.top
adcritic.net
adlucrumnewsletter.com
admin.corpivensa.gob.ve
advantageiq.com
@@ -15,6 +17,9 @@ aghories.com
ai270.net
albagroup-eg.com
alchemy.net
alohabeachcamp.net
aluminumpipetubing.com
americanstorageca.com
anchorfundhub.com
anglishment.com
anteldata.net.uy
@@ -32,6 +37,7 @@ aurelienvos.com
automatech.lat
avistaadvantage.com
b8sales.com
banaras.co
bearandbullmarketnews.com
bestinvestingtime.com
biocorp.com
@@ -42,14 +48,17 @@ bluhosting.com
bodiax.pp.ua
bost-law.com
brainity.com
brazalnde.net
brnonet.cz
brushinglegal.de
brw.net
budgeteasehub.com
buoytoys.com
c53dw7m24rj.com
cashflowmasterypro.com
cavabeen.com
cbti.net
chauffeurplan.co.uk
checkpox.fun
chegouseuvlache.org
christus.mx
@@ -63,12 +72,16 @@ cnode.io
code-it.net
colombiaceropapel.org
commerceinsurance.com
comsharempc.com
coolblaze.com
coowo.com
corpemail.net
cp2-myorderbox.com
cps.com.ar
ctla.co.kr
cumbalikonakhotel.com
currencyexconverter.com
daakbabu.com
dastans.ru
datahost36.de
descarca-counter-strike.net
@@ -78,6 +91,7 @@ dinofelis.cn
diwkyncbi.top
dkginternet.com
dns-oid.com
domainserver.ne.jp
domconfig.com
doorsrv.com
dreampox.fun
@@ -86,6 +100,7 @@ ds.network
dvj.theworkpc.com
dwlcka.com
dyntcorp.com
easternkingspei.com
economiceagles.com
egosimail.com
emailgids.net
@@ -96,7 +111,9 @@ erestaff.com
example.com
exposervers.com-new
eyecandyhosting.xyz
feipnghardware.com
fetscorp.shop
fewo-usedom.net
fin-crime.com
financeaimpoint.com
financeupward.com
@@ -104,19 +121,27 @@ flex-video.bnr.la
formicidaehunt.net
fosterheap.com
frontiernet.net
ftifb7tk3c.com
gendns.com
getgreencardsfast.com
getthatroi.com
gigidea.net
giize.com
ginous.eu.com
gist-th.com
goldsboroughplace.com
gophermedia.com
gqlists.us.com
gratzl.de
greatestworldnews.com
greennutritioncare.com
h-serv.co.uk
haedefpartners.com
halcyon-aboveboard.com
hanzubon.org
hgnbroken.us.com
hopsinthehanger.com
hostelsucre.com
hosting1337.com
hostinglotus.cloud
hostingmichigan.com
@@ -137,20 +162,25 @@ idealconcept.live
igppevents.org.uk
imjtmn.cn
immenzaces.com
indulgent-holistic.com
inshaaegypt.com
ip-147-135-108.us
ip-178-33-109.eu
ip-ptr.tech
iswhatpercent.com
itsidc.com
itwebs.com
ivol.co
jalanet.co.id
jimishare.com
jlenterprises.co.uk
joyomokei.com
jumanra.org
kahlaa.com
kbronet.com.tw
kdnursing.org
kihy.theworkpc.com
kingschurchwirral.org
kitchenaildbd.com
layerdns.cloud
legenditds.com
@@ -159,6 +189,7 @@ listertermoformadoa.com
llsend.com
lohkal.com
lonestarmm.net
longmarquis.com
longwoodmgmt.com
lwl-puehringer.at
lynx.net.lb
@@ -173,7 +204,10 @@ matroguel.cam
maximpactipo.com
mechanicalwalk.store
mediavobis.com
mindworksunlimited.com
mirth-gale.com
misorpresa.com
mjinn.com
moderntradingnews.com
moonjaws.com
morningnewscatcher.com
@@ -182,8 +216,10 @@ mschosting.com
msdp1.com
mspnet.pro
mts-nn.ru
multifamilydesign.com
mxserver.ro
mxthunder.net
my-ihor.ru
myrewards.net
mysagestore.com
mysecurewebserver.com
@@ -202,67 +238,84 @@ newwallstreetcode.com
ngvcv.cn
nic.name
nidix.net
nieuwedagnetwerk.net
nlscanme.com
nmeuh.cn
noisndametal.com
nwo.giize.com
nwwhalewatchers.org
offerslatedeals.com
office365.us
ogicom.net
olivettilexikon.co.uk
omegabrasil.inf.br
onnet21.com
oppt-ac.fit
orbitel.net.co
outsidences.com
ovaltinalization.co
overta.ru
ox28vgrurc.com
panaltyspot.space
passionatesmiles.com
paulinelam.com
perimetercenter.net
permanentscreen.com
phdns3.es
pigelixval1.com
planethoster.net
plesk.page
pmnhost.net
pokiloandhu.net
pokupki5.ru
popiup.com
ports.net
prima.com.ar
prima.net.ar
profsol.co.uk
prohealthmotion.com
proudserver.com
psnm.ru
pvcwindowsprices.live
qontenciplc.autos
quatthonggiotico.com
qxyxab44njd.com
rapidns.com
raxa.host
reliablepanel.com
rgb365.eu
riddlecamera.net
riddletrends.com
runnin-rebels.com
rwdhosting.ca
s500host.com
sahacker-2020.com
samsales.site
saransk.ru
satirogluet.com
scioncontacts.com
seaspraymta3.net
secorp.mx
securen.net
securerelay.in
securev.net
servershost.biz
shopfox.ca
silvestrejaguar.sbs
silvestreonca.sbs
simplediagnostics.org
siriuscloud.jp
sisglobalresearch.com
smallvillages.com
smartape-vps.com
solusoftware.com
southcoastwebhosting12.com
spiritualtechnologies.io
sprout.org
stableserver.net
stockepictigers.com
stockexchangejournal.com
subterranean-concave.com
suksangroup.com
sysop4.com
system.eu.com
@@ -276,8 +329,10 @@ thaicloudsolutions.com
thaimonster.com
thepushcase.com
totaal.net
tqh.ro
traderlearningcenter.com
tullostrucking.com
turbinetrends.com
ultragate.com
unite.services
urawasl.com
@@ -294,8 +349,10 @@ web-login.eu
weblinkinternational.com
webnox.io
welllivinghive.com
whoflew.com
wisdomhard.com
wisewealthcircle.com
wodeniowa.com
wsiph2.com
xnt.mx
xpnuf.cn

View File

@@ -0,0 +1,184 @@
#!/usr/bin/env python3
from __future__ import annotations
import os
import csv
from pathlib import Path
from typing import Mapping, Iterable, Optional, Collection, Union, List, Dict
class CSVValidationError(Exception):
    """Raised when a CSV file fails validation.

    The exception message is every individual error joined with newlines;
    the untouched list of messages is kept on the ``errors`` attribute.
    """

    def __init__(self, errors: list[str]):
        message = "\n".join(errors)
        super().__init__(message)
        # Keep the individual messages so callers can inspect them one by one.
        self.errors = errors
def sort_csv(
    filepath: Union[str, Path],
    field: str,
    *,
    sort_field_value_must_be_unique: bool = True,
    strip_whitespace: bool = True,
    fields_to_lowercase: Optional[Iterable[str]] = None,
    case_insensitive_sort: bool = False,
    required_fields: Optional[Iterable[str]] = None,
    allowed_values: Optional[Mapping[str, Collection[str]]] = None,
) -> List[Dict[str, str]]:
    """
    Read a CSV, optionally normalize rows (strip whitespace, lowercase certain
    fields), validate field values, and write the sorted CSV back to the same
    path.

    - filepath: Path to the CSV to sort.
    - field: The field name to sort by.
    - sort_field_value_must_be_unique: Report an error when two rows share
      the same value in the sort field.
    - strip_whitespace: Remove all whitespace at the beginning and end of
      field values.
    - fields_to_lowercase: Permanently lowercases these field(s) in the data.
    - case_insensitive_sort: Ignore case when sorting without changing values.
    - required_fields: A list of fields that must have data in all rows.
    - allowed_values: A mapping of allowed values for fields.

    Returns the normalized rows in sorted order (the same data that was
    written back to the file).

    Raises CSVValidationError when the sort column or a required header is
    missing, or when any row fails validation. The file is not rewritten in
    that case.
    """
    path = Path(filepath)
    required = set(required_fields or [])
    lower_set = set(fields_to_lowercase or [])
    allowed_sets = {k: set(v) for k, v in (allowed_values or {}).items()}
    # A set gives O(1) duplicate checks; it stays empty when uniqueness is
    # not enforced.
    seen_sort_field_values = set()

    with path.open("r", newline="") as infile:
        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames or []
        if field not in fieldnames:
            raise CSVValidationError([f"Missing sort column: {field!r}"])
        missing_headers = required - set(fieldnames)
        if missing_headers:
            raise CSVValidationError(
                [f"Missing required header(s): {sorted(missing_headers)}"]
            )
        rows = list(reader)

    def normalize_row(row: Dict[str, str]) -> None:
        """Strip and lowercase the row's values in place, as configured."""
        if strip_whitespace:
            for key, value in row.items():
                # Short rows yield None and over-long rows yield a list, so
                # only real strings are touched.
                if isinstance(value, str):
                    row[key] = value.strip()
        for name in lower_set:
            value = row.get(name)
            if isinstance(value, str):
                row[name] = value.lower()

    def validate_row(
        row: Dict[str, str], sort_field: str, line_no: int, errors: List[str]
    ) -> None:
        """Append a message to ``errors`` for every problem found in ``row``."""
        if sort_field_value_must_be_unique:
            sort_value = row[sort_field]
            if sort_value in seen_sort_field_values:
                errors.append(f"Line {line_no}: Duplicate row for '{sort_field}'")
            else:
                # Record the row's VALUE (not the column name) so that later
                # rows are compared against what has actually been seen.
                seen_sort_field_values.add(sort_value)
        for rf in required:
            val = row.get(rf)
            if val is None or val == "":
                errors.append(
                    f"Line {line_no}: Missing value for required field '{rf}'"
                )
        for name, allowed in allowed_sets.items():
            # Columns that are absent from the row are silently skipped.
            if name in row:
                val = row[name]
                if val not in allowed:
                    errors.append(
                        f"Line {line_no}: '{val}' is not an allowed value for '{name}' "
                        f"(allowed: {sorted(allowed)})"
                    )

    errors: List[str] = []
    # Header is line 1. This numbering assumes one physical line per record
    # and no blank lines in the file.
    for idx, row in enumerate(rows, start=2):
        normalize_row(row)
        validate_row(row, field, idx, errors)
    if errors:
        raise CSVValidationError(errors)

    def sort_key(r: Dict[str, str]) -> str:
        # A short row leaves the sort field as None; treat it as empty so the
        # comparison with other strings cannot raise TypeError.
        v = r.get(field) or ""
        if case_insensitive_sort:
            return v.casefold()
        return v

    rows.sort(key=sort_key)

    # Path.open() takes the mode as its first argument; the path itself is
    # already bound to ``path``.
    with path.open("w", newline="") as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    return rows
def sort_list_file(
    filepath: Union[str, Path],
    *,
    lowercase: bool = True,
    strip: bool = True,
    deduplicate: bool = True,
    remove_blank_lines: bool = True,
    ending_newline: bool = True,
    newline: Optional[str] = "\n",
):
    """Read a list from a file, sort it, optionally strip and deduplicate the
    values, then write that list back to the file.

    - filepath: The path to the file.
    - lowercase: Lowercase all values prior to sorting.
    - strip: Remove leading and trailing whitespace from every value.
    - deduplicate: Keep only one copy of each value.
    - remove_blank_lines: Remove any blank lines.
    - ending_newline: End the file with a newline, even if remove_blank_lines
      is true.
    - newline: Passed as the ``newline`` argument of ``open()`` for both the
      read and the write.

    An empty file (or one that ends up with no values) is written back empty.
    """
    with open(filepath, mode="r", newline=newline) as infile:
        raw_lines = infile.readlines()

    values = []
    for line in raw_lines:
        # Always drop the line terminator: values are re-joined with "\n"
        # below, so keeping it would double every newline when strip=False.
        line = line.rstrip("\r\n")
        if lowercase:
            line = line.lower()
        if strip:
            line = line.strip()
        values.append(line)

    if deduplicate:
        values = list(set(values))
    if remove_blank_lines:
        values = [value for value in values if value != ""]
    values.sort()

    # A trailing empty element makes the join end with a newline. The
    # ``values`` guard keeps an empty list from raising IndexError.
    if ending_newline and values and values[-1] != "":
        values.append("")

    with open(filepath, mode="w", newline=newline) as outfile:
        outfile.write("\n".join(values))
def _main():
    """Sort the list files and the reverse DNS map in the current directory.

    All file names are relative to the current working directory. The two
    domain lists and the types list are normalized and sorted in place, then
    the map CSV is validated against the types list and sorted by its
    ``base_reverse_dns`` column. Exits with status 1 if any input file is
    missing; a validation failure is printed but does not change the exit
    status.
    """
    map_file = "base_reverse_dns_map.csv"
    map_key = "base_reverse_dns"
    list_files = ["known_unknown_base_reverse_dns.txt", "psl_overrides.txt"]
    types_file = "base_reverse_dns_types.txt"

    # Check every input before touching any of them, so a missing file is
    # reported cleanly and nothing is left half-processed.
    for required_file in [*list_files, types_file, map_file]:
        if not os.path.exists(required_file):
            print(f"Error: {required_file} does not exist")
            exit(1)

    for list_file in list_files:
        sort_list_file(list_file)
    # Type names are case-sensitive labels (e.g. "SaaS"), so keep their case.
    sort_list_file(types_file, lowercase=False)

    with open(types_file) as f:
        # Strip the line terminators: sort_csv compares these against
        # whitespace-stripped cell values, so "SaaS\n" would never match.
        types = [line.strip() for line in f]
    types = [t for t in types if t != ""]

    # NOTE(review): sort_csv only checks allowed values for columns that are
    # present in the row, so this key must match the CSV header exactly -
    # confirm the header is spelled "Type".
    map_allowed_values = {"Type": types}

    try:
        sort_csv(map_file, map_key, allowed_values=map_allowed_values)
    except CSVValidationError as e:
        print(f"{map_file} did not validate: {e}")


if __name__ == "__main__":
    _main()

View File

@@ -1,69 +0,0 @@
#!/usr/bin/env python3
import os
import csv
# Directory that holds the map and list files; "." means the script works on
# the current working directory.
maps_dir = os.path.join(".")
# CSV files handed to sort_csv() by the driver loop at the end of the script.
map_files = ["base_reverse_dns_map.csv"]
# Plain-text, one-value-per-line files handed to sort_list_file().
list_files = ["known_unknown_base_reverse_dns.txt", "psl_overrides.txt"]
def sort_csv(filepath, column=0, strip_whitespace=True):
    """Sort the data rows of a CSV file in place by one column.

    - filepath: Path to the CSV file; its first row is treated as the header
      and is written back unchanged.
    - column: Zero-based index of the column to sort by.
    - strip_whitespace: Remove leading and trailing whitespace from every
      field before sorting.

    Prints a warning for every value that appears more than once in the sort
    column. An empty file is left untouched, and completely blank rows are
    dropped.
    """
    with open(filepath, mode="r", newline="\n") as infile:
        reader = csv.reader(infile)
        header = next(reader, None)
        if header is None:
            # Nothing to sort; a bare next(reader) would raise StopIteration.
            return
        # csv.reader yields [] for blank lines, which have no sort column.
        rows = [row for row in reader if row]

    if strip_whitespace:
        rows = [[value.strip() for value in row] for row in rows]

    sorted_rows = sorted(rows, key=lambda row: row[column])

    # A set keeps the duplicate check linear in the number of rows.
    seen_values = set()
    for row in sorted_rows:
        value = row[column]
        if value in seen_values:
            print(f"Warning: {value} is in {filepath} multiple times")
        else:
            seen_values.add(value)

    with open(filepath, mode="w", newline="\n") as outfile:
        writer = csv.writer(outfile, lineterminator="\n")
        writer.writerow(header)
        writer.writerows(sorted_rows)
def sort_list_file(
    filepath,
    lowercase=True,
    strip=True,
    deduplicate=True,
    remove_blank_lines=True,
    ending_newline=True,
    newline="\n",
):
    """Normalize and sort a one-value-per-line text file in place.

    - filepath: Path to the file.
    - lowercase: Lowercase all values prior to sorting.
    - strip: Remove leading and trailing whitespace from every value.
    - deduplicate: Keep only one copy of each value.
    - remove_blank_lines: Remove any blank lines.
    - ending_newline: End the file with a newline.
    - newline: Passed as the ``newline`` argument of ``open()`` for both the
      read and the write.

    An empty file (or one that ends up with no values) is written back empty.
    """
    with open(filepath, mode="r", newline=newline) as infile:
        raw_lines = infile.readlines()

    values = []
    for line in raw_lines:
        # Always drop the line terminator: values are re-joined with "\n"
        # below, so keeping it would double every newline when strip=False.
        line = line.rstrip("\r\n")
        if lowercase:
            line = line.lower()
        if strip:
            line = line.strip()
        values.append(line)

    if deduplicate:
        values = list(set(values))
    if remove_blank_lines:
        values = [value for value in values if value != ""]
    values.sort()

    # A trailing empty element makes the join end with a newline. The
    # ``values`` guard keeps an empty list from raising IndexError.
    if ending_newline and values and values[-1] != "":
        values.append("")

    with open(filepath, mode="w", newline=newline) as outfile:
        outfile.write("\n".join(values))
# Script body: sort every configured map file, then every list file, each
# resolved relative to maps_dir.
for csv_file in map_files:
    csv_path = os.path.join(maps_dir, csv_file)
    sort_csv(csv_path)

for list_file in list_files:
    list_path = os.path.join(maps_dir, list_file)
    sort_list_file(list_path)