Mirror of https://github.com/domainaware/parsedmarc.git (synced 2026-02-18 15:36:24 +00:00)

Compare commits: 152 commits
Commit SHA1s:

110c6e507d, c8cdd90a1e, 46a62cc10a, 67fe009145, e405e8fa53, a72d08ceb7, 2785e3df34, f4470a7dd2,
18b9894a1f, d1791a97d3, 47ca6561c1, a0e18206ce, 9e4ffdd54c, 434bd49eb3, 589038d2c9, c558224671,
044aa9e9a0, 6270468d30, 832be7cfa3, 04dd11cf54, 0b41942916, f14a34202f, daa6653c29, 45d1093a99,
c1a757ca29, 69b9d25a99, 94d65f979d, 98342ecac6, 38a3d4eaae, a05c230152, 17bdc3a134, 858be00f22,
597ca64f9f, c5dbe2c4dc, 082b3d355f, 2a7ce47bb1, 9882405d96, fce84763b9, 8a299b8600, b4c2b21547,
865c249437, 013859f10e, 6d4a31a120, 45d3dc3b2e, 4bbd97dbaa, 5df152d469, d990bef342, caf77ca6d4,
4b3d32c5a6, 5df5c10f80, 308d4657ab, 0f74e33094, 9f339e11f5, 391e84b717, 8bf06ce5af, 2b7ae50a27,
3feb478793, 01630bb61c, 39347cb244, ed25526d59, 880d7110fe, d62001f5a4, 0720bffcb6, fecd55a97d,
a121306eed, 980c9c7904, 963f5d796f, 6532f3571b, ea878443a8, 9f6de41958, 119192701c, 1d650be48a,
a85553fb18, 5975d8eb21, 87ae6175f2, 68b93ed580, 55508b513b, 71511c0cfc, 7c45812284, 607a091a5f,
c308bf938c, 918501ccb5, 036c372ea3, a969d83137, e299f7d161, 4c04418dae, 2ca9373ed0, 961ef6d804,
573ba1e3e9, 1d8af3ccff, 8426daa26b, d1531b86f2, 8bb046798c, d64e12548a, 380479cbf1, ace21c8084,
1a1aef21ad, 532dbbdb7e, 45738ae688, 9d77bd64bc, 140290221d, 187d61b770, 0443b7365e, d7b887a835,
a805733221, 9552c3ac92, 5273948be0, b51756b8bd, 7fa7c24cb8, 972237ae7e, 6e5333a342, 47b074c80b,
a1cfeb3081, c7c451b1b1, 669deb9755, 446c018920, 38c6f86973, 62ccc11925, c32ca3cae3, 010f1f84a7,
7da57c6382, d08e29a306, e1e53ad4cb, 4670e9687d, 7f8a2c08cd, e9c05dd0bf, 9348a474dd, e0decaba8c,
26a651cded, bcfcd93fc6, 54d5ed3543, 1efbc87e0e, e78e7f64af, ad9de65b99, b9df12700b, 20843b920f,
e5ae89fedf, f148cff11c, 4583769e04, 0ecb80b27c, b8e62e6d3b, c67953a2c5, 27dff4298c, f2133aacd4,
31917e58a9, bffb98d217, 1f93b3a7ea, 88debb9729, a8a5564780, 1e26f95b7b, 82b48e4d01, 617b7c5b4a
.github/workflows/python-tests.yml (vendored): 24 changed lines
@@ -11,13 +11,26 @@ on:

jobs:
  build:

    runs-on: ubuntu-latest

    services:
      elasticsearch:
        image: elasticsearch:8.18.2
        env:
          discovery.type: single-node
          cluster.name: parsedmarc-cluster
          discovery.seed_hosts: elasticsearch
          bootstrap.memory_lock: true
          xpack.security.enabled: false
          xpack.license.self_generated.type: basic
        ports:
          - 9200:9200
          - 9300:9300

    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]

    steps:
    - uses: actions/checkout@v4
@@ -29,13 +42,6 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install -y libemail-outlook-message-perl
          wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo gpg --dearmor -o /usr/share/keyrings/elasticsearch-keyring.gpg
          sudo apt-get install apt-transport-https
          echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/8.x/apt stable main" | sudo tee /etc/apt/sources.list.d/elastic-8.x.list
          sudo apt-get update && sudo apt-get install elasticsearch
          sudo sed -i 's/xpack.security.enabled: true/xpack.security.enabled: false/' /etc/elasticsearch/elasticsearch.yml
          sudo systemctl restart elasticsearch
          sudo systemctl --no-pager status elasticsearch
      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
.gitignore (vendored): 11 changed lines
@@ -106,7 +106,7 @@ ENV/
.idea/

# VS Code launch config
.vscode/launch.json
#.vscode/launch.json

# Visual Studio Code settings
#.vscode/
@@ -136,3 +136,12 @@ samples/private

*.html
*.sqlite-journal

parsedmarc.ini
scratch.py

parsedmarc/resources/maps/base_reverse_dns.csv
parsedmarc/resources/maps/unknown_base_reverse_dns.csv
parsedmarc/resources/maps/sus_domains.csv
parsedmarc/resources/maps/unknown_domains.txt
*.bak
.vscode/launch.json (vendored, new file): 45 lines
@@ -0,0 +1,45 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: Current File",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal"
        },
        {
            "name": "tests.py",
            "type": "debugpy",
            "request": "launch",
            "program": "tests.py",
            "console": "integratedTerminal"
        },
        {
            "name": "sample",
            "type": "debugpy",
            "request": "launch",
            "module": "parsedmarc.cli",
            "args": ["samples/private/sample"]
        },
        {
            "name": "sortlists.py",
            "type": "debugpy",
            "request": "launch",
            "program": "sortlists.py",
            "cwd": "${workspaceFolder}/parsedmarc/resources/maps",
            "console": "integratedTerminal"
        },
        {
            "name": "find_unknown_base_reverse_dns.py",
            "type": "debugpy",
            "request": "launch",
            "program": "find_unknown_base_reverse_dns.py",
            "cwd": "${workspaceFolder}/parsedmarc/resources/maps",
            "console": "integratedTerminal"
        }
    ]
}
.vscode/settings.json (vendored): 13 changed lines
@@ -13,6 +13,7 @@
    "automodule", "backported", "bellsouth", "boto", "brakhane", "Brightmail", "CEST",
@@ -36,6 +37,7 @@
    "expiringdict", "fieldlist", "genindex", "geoip", "geoipupdate", "Geolite", "geolocation",
@@ -44,7 +46,10 @@
    "hostnames", "htpasswd", "httpasswd", "httplib", "IMAP", "imapclient", "infile",
    "Interaktive", "IPDB", "journalctl",
@@ -70,6 +75,7 @@
    "modindex", "msgconvert", "msgraph", "MSSP", "Munge", "ndjson", "newkey",
@@ -79,14 +85,18 @@
    "nosecureimap", "nosniff", "nwettbewerb", "opensearch", "parsedmarc", "passsword",
    "Postorius", "premade", "procs", "publicsuffix", "publicsuffixlist", "publixsuffix",
    "pygelf", "pypy", "pytest", "quickstart", "Reindex", "replyto",
@@ -94,10 +104,13 @@
    "Rollup", "Rpdm", "SAMEORIGIN", "sdist", "Servernameone", "setuptools", "smartquotes",
    "SMTPTLS", "sortlists", "sortmaps", "sourcetype", "STARTTLS", "tasklist",
CHANGELOG.md: 106 changed lines
@@ -1,6 +1,110 @@
Changelog
=========

9.0.0
------

- Normalize aggregate DMARC report volumes when a report timespan exceeds 24 hours

8.19.1
------

- Ignore HTML content type in report email parsing (#626)

8.19.0
------

- Add multi-tenant support via an index-prefix domain mapping file
- PSL overrides so that services like AWS are correctly identified
- Additional improvements to report type detection
- Fix webhook timeout parsing (PR #623)
- Output to STDOUT when the new general config boolean `silent` is set to `False` (Close #614)
- Additional services added to `base_reverse_dns_map.csv`

8.18.9
------

- Complete fix for #687 and more robust report type detection

8.18.8
------

- Fix parsing emails with an uncompressed aggregate report attachment (Closes #607)
- Add `--no-prettify-json` CLI option (PR #617)

8.18.7
------

Removed improper spaces from `base_reverse_dns_map.csv` (Closes #612)

8.18.6
------

- Fix since option to correctly work with weeks (PR #604)
- Add 183 entries to `base_reverse_dns_map.csv`
- Add 57 entries to `known_unknown_base_reverse_dns.txt`
- Check for invalid UTF-8 bytes in `base_reverse_dns_map.csv` at build
- Exclude unneeded items from the `parsedmarc.resources` module at build

8.18.5
------

- Fix CSV download

8.18.4
------

- Fix webhooks

8.18.3
------

- Move `__version__` to `parsedmarc.constants`
- Create a constant `USER_AGENT`
- Use the HTTP `User-Agent` header value `parsedmarc/version` for all HTTP requests

8.18.2
------

- Merged PR #603
- Fixes issue #595 - CI test fails for Elasticsearch
- Moved Elasticsearch to a separate Docker service container for CI testing
- Dropped Python 3.8 from CI testing
- Fixes lookup and saving of DMARC forensic reports in Elasticsearch and OpenSearch
- Updated fallback `base_reverse_dns_map.csv`, which now includes over 1,400 lines
- Updated included `dbip-country-lite.mmdb` to the June 2025 release
- Automatically fall back to the internal `base_reverse_dns_map.csv` if the received file is not valid (Fixes #602)
- Print the received data to the debug log

8.18.1
------

- Add missing `https://` to the default Microsoft Graph URL

8.18.0
------

- Add support for Microsoft national clouds via Graph API base URL (PR #590)
- Avoid stopping processing when an invalid DMARC report is encountered (PR #587)
- Increase `http.client._MAXHEADERS` from `100` to `200` to avoid errors connecting to Elasticsearch/OpenSearch (PR #589)

8.17.0
------

- Ignore duplicate aggregate DMARC reports with the same `org_name` and `report_id` seen within the same hour (Fixes #535)
- Fix saving SMTP TLS reports to OpenSearch (PR #585 closed issue #576)
- Add 303 entries to `base_reverse_dns_map.csv`

8.16.1
------

- Failed attempt to ignore aggregate DMARC reports seen within a period of one hour (#535)

8.16.0
------

- Add a `since` option to only search for emails since a certain time (PR #527)

8.15.4
------

@@ -634,7 +738,7 @@ in the ``elasticsearch`` configuration file section (closes issue #78)
-----

- Add filename and line number to logging output
- Improved IMAP error handling
- Improved IMAP error handling
- Add CLI options
Dockerfile

@@ -1,4 +1,4 @@
ARG BASE_IMAGE=python:3.9-slim
ARG BASE_IMAGE=python:3.13-slim
ARG USERNAME=parsedmarc
ARG USER_UID=1000
ARG USER_GID=$USER_UID
README.md: 12 changed lines
@@ -9,7 +9,7 @@ Package](https://img.shields.io/pypi/v/parsedmarc.svg)](https://pypi.org/project
[](https://pypistats.org/packages/parsedmarc)

<p align="center">
<img src="https://github.com/domainaware/parsedmarc/raw/master/docs/source/_static/screenshots/dmarc-summary-charts.png?raw=true" alt="A screenshot of DMARC summary charts in Kibana"/>
<img src="https://raw.githubusercontent.com/domainaware/parsedmarc/refs/heads/master/docs/source/_static/screenshots/dmarc-summary-charts.png?raw=true" alt="A screenshot of DMARC summary charts in Kibana"/>
</p>

`parsedmarc` is a Python module and CLI utility for parsing DMARC
@@ -34,14 +34,14 @@ Thanks to all

## Features

- Parses draft and 1.0 standard aggregate/rua reports
- Parses forensic/failure/ruf reports
- Can parse reports from an inbox over IMAP, Microsoft Graph, or Gmail
  API
- Parses draft and 1.0 standard aggregate/rua DMARC reports
- Parses forensic/failure/ruf DMARC reports
- Parses reports from SMTP TLS Reporting
- Can parse reports from an inbox over IMAP, Microsoft Graph, or Gmail API
- Transparently handles gzip or zip compressed reports
- Consistent data structures
- Simple JSON and/or CSV output
- Optionally email the results
- Optionally send the results to Elasticsearch and/or Splunk, for use
- Optionally send the results to Elasticsearch, Opensearch, and/or Splunk, for use
  with premade dashboards
- Optionally send reports to Apache Kafka
build.sh: 11 changed lines
@@ -14,8 +14,15 @@ cd docs
make clean
make html
touch build/html/.nojekyll
cp -rf build/html/* ../../parsedmarc-docs/
if [ -d "./../parsedmarc-docs" ]; then
    cp -rf build/html/* ../../parsedmarc-docs/
fi
cd ..
cd parsedmarc/resources/maps
python3 sortlists.py
echo "Checking for invalid UTF-8 bytes in base_reverse_dns_map.csv"
python3 find_bad_utf8.py base_reverse_dns_map.csv
cd ../../..
python3 tests.py
rm -rf dist/ build/
hatch build
hatch build
@@ -28,3 +28,30 @@ services:
      interval: 10s
      timeout: 10s
      retries: 24

  opensearch:
    image: opensearchproject/opensearch:2.18.0
    environment:
      - network.host=127.0.0.1
      - http.host=0.0.0.0
      - node.name=opensearch
      - discovery.type=single-node
      - cluster.name=parsedmarc-cluster
      - discovery.seed_hosts=opensearch
      - bootstrap.memory_lock=true
      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD}
    ports:
      - 127.0.0.1:9201:9200
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:
      test:
        [
          "CMD-SHELL",
          "curl -s -XGET http://localhost:9201/_cluster/health?pretty | grep status | grep -q '\\(green\\|yellow\\)'"
        ]
      interval: 10s
      timeout: 10s
      retries: 24
@@ -21,7 +21,6 @@
   :members:
```

## parsedmarc.splunk

```{eval-rst}
@@ -33,15 +33,16 @@ and Valimail.

## Features

- Parses draft and 1.0 standard aggregate/rua reports
- Parses forensic/failure/ruf reports
- Parses draft and 1.0 standard aggregate/rua DMARC reports
- Parses forensic/failure/ruf DMARC reports
- Parses reports from SMTP TLS Reporting
- Can parse reports from an inbox over IMAP, Microsoft Graph, or Gmail API
- Transparently handles gzip or zip compressed reports
- Consistent data structures
- Simple JSON and/or CSV output
- Optionally email the results
- Optionally send the results to Elasticsearch/OpenSearch and/or Splunk, for use with
  premade dashboards
- Optionally send the results to Elasticsearch, Opensearch, and/or Splunk, for use
  with premade dashboards
- Optionally send reports to Apache Kafka

```{toctree}
@@ -23,6 +23,8 @@ of the report schema.
    "report_id": "9391651994964116463",
    "begin_date": "2012-04-27 20:00:00",
    "end_date": "2012-04-28 19:59:59",
    "timespan_requires_normalization": false,
    "original_timespan_seconds": 86399,
    "errors": []
  },
  "policy_published": {
@@ -39,8 +41,10 @@ of the report schema.
      "source": {
        "ip_address": "72.150.241.94",
        "country": "US",
        "reverse_dns": "adsl-72-150-241-94.shv.bellsouth.net",
        "base_domain": "bellsouth.net"
        "reverse_dns": null,
        "base_domain": null,
        "name": null,
        "type": null
      },
      "count": 2,
      "alignment": {
@@ -74,7 +78,10 @@ of the report schema.
          "result": "pass"
        }
      ]
    }
  },
  "normalized_timespan": false,
  "interval_begin": "2012-04-28 00:00:00",
  "interval_end": "2012-04-28 23:59:59"
  }
 ]
}
@@ -83,8 +90,10 @@ of the report schema.

### CSV aggregate report

```text
xml_schema,org_name,org_email,org_extra_contact_info,report_id,begin_date,end_date,errors,domain,adkim,aspf,p,sp,pct,fo,source_ip_address,source_country,source_reverse_dns,source_base_domain,count,spf_aligned,dkim_aligned,dmarc_aligned,disposition,policy_override_reasons,policy_override_comments,envelope_from,header_from,envelope_to,dkim_domains,dkim_selectors,dkim_results,spf_domains,spf_scopes,spf_results
draft,acme.com,noreply-dmarc-support@acme.com,http://acme.com/dmarc/support,9391651994964116463,2012-04-27 20:00:00,2012-04-28 19:59:59,,example.com,r,r,none,none,100,0,72.150.241.94,US,adsl-72-150-241-94.shv.bellsouth.net,bellsouth.net,2,True,False,True,none,,,example.com,example.com,,example.com,none,fail,example.com,mfrom,pass
xml_schema,org_name,org_email,org_extra_contact_info,report_id,begin_date,end_date,normalized_timespan,errors,domain,adkim,aspf,p,sp,pct,fo,source_ip_address,source_country,source_reverse_dns,source_base_domain,source_name,source_type,count,spf_aligned,dkim_aligned,dmarc_aligned,disposition,policy_override_reasons,policy_override_comments,envelope_from,header_from,envelope_to,dkim_domains,dkim_selectors,dkim_results,spf_domains,spf_scopes,spf_results
draft,acme.com,noreply-dmarc-support@acme.com,http://acme.com/dmarc/support,9391651994964116463,2012-04-28 00:00:00,2012-04-28 23:59:59,False,,example.com,r,r,none,none,100,0,72.150.241.94,US,,,,,2,True,False,True,none,,,example.com,example.com,,example.com,none,fail,example.com,mfrom,pass
draft,acme.com,noreply-dmarc-support@acme.com,http://acme.com/dmarc/support,9391651994964116463,2012-04-28 00:00:00,2012-04-28 23:59:59,False,,example.com,r,r,none,none,100,0,72.150.241.94,US,,,,,2,True,False,True,none,,,example.com,example.com,,example.com,none,fail,example.com,mfrom,pass

```
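The CSV layout above is flat, so it can be post-processed with nothing more than the standard library. The following is an illustrative sketch (not part of parsedmarc) that totals message counts per normalized day, assuming the CSV was saved as `aggregate.csv`:

```python
import csv
from collections import Counter

totals = Counter()
with open("aggregate.csv", newline="") as csv_file:
    for row in csv.DictReader(csv_file):
        # begin_date/end_date reflect the per-day interval when
        # normalized_timespan is True.
        day = row["begin_date"][:10]
        totals[day, row["dmarc_aligned"]] += int(row["count"])

for (day, aligned), count in sorted(totals.items()):
    print(f"{day}  dmarc_aligned={aligned}  messages={count}")
```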
## Sample forensic report output
@@ -4,47 +4,50 @@

```text
usage: parsedmarc [-h] [-c CONFIG_FILE] [--strip-attachment-payloads] [-o OUTPUT]
                  [--aggregate-json-filename AGGREGATE_JSON_FILENAME]
                  [--forensic-json-filename FORENSIC_JSON_FILENAME]
                  [--aggregate-csv-filename AGGREGATE_CSV_FILENAME]
                  [--forensic-csv-filename FORENSIC_CSV_FILENAME]
                  [-n NAMESERVERS [NAMESERVERS ...]] [-t DNS_TIMEOUT] [--offline]
                  [-s] [--verbose] [--debug] [--log-file LOG_FILE] [-v]
                  [file_path ...]
                  [--aggregate-json-filename AGGREGATE_JSON_FILENAME] [--forensic-json-filename FORENSIC_JSON_FILENAME]
                  [--smtp-tls-json-filename SMTP_TLS_JSON_FILENAME] [--aggregate-csv-filename AGGREGATE_CSV_FILENAME]
                  [--forensic-csv-filename FORENSIC_CSV_FILENAME] [--smtp-tls-csv-filename SMTP_TLS_CSV_FILENAME]
                  [-n NAMESERVERS [NAMESERVERS ...]] [-t DNS_TIMEOUT] [--offline] [-s] [-w] [--verbose] [--debug]
                  [--log-file LOG_FILE] [--no-prettify-json] [-v]
                  [file_path ...]

Parses DMARC reports
Parses DMARC reports

positional arguments:
  file_path             one or more paths to aggregate or forensic report
                        files, emails, or mbox files'
positional arguments:
  file_path             one or more paths to aggregate or forensic report files, emails, or mbox files'

optional arguments:
  -h, --help            show this help message and exit
  -c CONFIG_FILE, --config-file CONFIG_FILE
                        a path to a configuration file (--silent implied)
  --strip-attachment-payloads
                        remove attachment payloads from forensic report output
  -o OUTPUT, --output OUTPUT
                        write output files to the given directory
  --aggregate-json-filename AGGREGATE_JSON_FILENAME
                        filename for the aggregate JSON output file
  --forensic-json-filename FORENSIC_JSON_FILENAME
                        filename for the forensic JSON output file
  --aggregate-csv-filename AGGREGATE_CSV_FILENAME
                        filename for the aggregate CSV output file
  --forensic-csv-filename FORENSIC_CSV_FILENAME
                        filename for the forensic CSV output file
  -n NAMESERVERS [NAMESERVERS ...], --nameservers NAMESERVERS [NAMESERVERS ...]
                        nameservers to query
  -t DNS_TIMEOUT, --dns_timeout DNS_TIMEOUT
                        number of seconds to wait for an answer from DNS
                        (default: 2.0)
  --offline             do not make online queries for geolocation or DNS
  -s, --silent          only print errors and warnings
  --verbose             more verbose output
  --debug               print debugging information
  --log-file LOG_FILE   output logging to a file
  -v, --version         show program's version number and exit
options:
  -h, --help            show this help message and exit
  -c CONFIG_FILE, --config-file CONFIG_FILE
                        a path to a configuration file (--silent implied)
  --strip-attachment-payloads
                        remove attachment payloads from forensic report output
  -o OUTPUT, --output OUTPUT
                        write output files to the given directory
  --aggregate-json-filename AGGREGATE_JSON_FILENAME
                        filename for the aggregate JSON output file
  --forensic-json-filename FORENSIC_JSON_FILENAME
                        filename for the forensic JSON output file
  --smtp-tls-json-filename SMTP_TLS_JSON_FILENAME
                        filename for the SMTP TLS JSON output file
  --aggregate-csv-filename AGGREGATE_CSV_FILENAME
                        filename for the aggregate CSV output file
  --forensic-csv-filename FORENSIC_CSV_FILENAME
                        filename for the forensic CSV output file
  --smtp-tls-csv-filename SMTP_TLS_CSV_FILENAME
                        filename for the SMTP TLS CSV output file
  -n NAMESERVERS [NAMESERVERS ...], --nameservers NAMESERVERS [NAMESERVERS ...]
                        nameservers to query
  -t DNS_TIMEOUT, --dns_timeout DNS_TIMEOUT
                        number of seconds to wait for an answer from DNS (default: 2.0)
  --offline             do not make online queries for geolocation or DNS
  -s, --silent          only print errors
  -w, --warnings        print warnings in addition to errors
  --verbose             more verbose output
  --debug               print debugging information
  --log-file LOG_FILE   output logging to a file
  --no-prettify-json    output JSON in a single line without indentation
  -v, --version         show program's version number and exit
```
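A few example invocations built only from the options listed above (file and directory names are placeholders):

```text
parsedmarc -c /etc/parsedmarc.ini
parsedmarc --offline -o ./output aggregate_report.xml.gz
parsedmarc --no-prettify-json -o ./output reports.mbox
```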
:::{note}
@@ -120,8 +123,10 @@ The full set of configuration options are:
  Elasticsearch, Splunk and/or S3
- `save_smtp_tls` - bool: Save SMTP TLS report data to
  Elasticsearch, Splunk and/or S3
- `index_prefix_domain_map` - str: Path to a YAML file that maps Opensearch/Elasticsearch index prefixes (tenant names) to lists of domain names
- `strip_attachment_payloads` - bool: Remove attachment
  payloads from results
- `silent` - bool: Set this to `False` to output results to STDOUT
- `output` - str: Directory to place JSON and CSV files in. This is required if you set either of the JSON output file options.
- `aggregate_json_filename` - str: filename for the aggregate
  JSON output file
@@ -208,6 +213,8 @@ The full set of configuration options are:
- `mailbox` - str: The mailbox name. This defaults to the
  current user if using the UsernamePassword auth method, but
  could be a shared mailbox if the user has access to the mailbox
- `graph_url` - str: Microsoft Graph URL. Allows for use of national clouds (e.g. Azure Gov)
  (Default: https://graph.microsoft.com)
- `token_file` - str: Path to save the token file
  (Default: `.token`)
- `allow_unencrypted_storage` - bool: Allows the Azure Identity
@@ -367,7 +374,7 @@ The full set of configuration options are:
- `mode` - str: The GELF transport type to use. Valid modes: `tcp`, `udp`, `tls`

- `maildir`
  - `reports_folder` - str: Full path of the maildir location (Default: `INBOX`)
  - `maildir_path` - str: Full path of the maildir location (Default: `INBOX`)
  - `maildir_create` - bool: Create the maildir if not present (Default: False)

- `webhook` - Post the individual reports to a webhook URL with the report as the JSON body
@@ -443,6 +450,28 @@ PUT _cluster/settings
  Increasing this value increases resource usage.
:::

## Multi-tenant support

Starting in `8.19.0`, ParseDMARC provides multi-tenant support by placing data into separate OpenSearch or Elasticsearch index prefixes. To set this up, create a YAML file in which each key is a tenant name and the value is a list of domains (not including subdomains) that belong to that tenant, like this:

```yaml
example:
  - example.com
  - example.net
  - example.org

whalensolutions:
  - whalensolutions.com
```

Save it to disk where the user running ParseDMARC can read it, then set `index_prefix_domain_map` to that file path in the `[general]` section of the ParseDMARC configuration file, and do not set an `index_prefix` option in the `[elasticsearch]` or `[opensearch]` sections.

When configured correctly, any report that ParseDMARC matches to a domain in the mapping is saved to an index whose name is prefixed with the tenant name and a trailing underscore. You can then use the security features of Opensearch or the ELK stack to grant users access only to the indexes they need.

:::{note}
A domain cannot be used in multiple tenant lists. Only the first prefix list that contains the matching domain is used.
:::
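For reference, a minimal sketch of the corresponding `[general]` configuration (the file path and the other options shown are examples only; adjust them to your deployment):

```ini
[general]
save_aggregate = True
save_forensic = True
# Example path: point this at the tenant-to-domain YAML mapping shown above
index_prefix_domain_map = /etc/parsedmarc/index_prefix_domain_map.yml
```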
## Running parsedmarc as a systemd service

Use systemd to run `parsedmarc` as a service and process reports as
grafana/Grafana-DMARC_Reports.json-new_panel.json (new file): 5901 lines
File diff suppressed because it is too large.
File diff suppressed because one or more lines are too long.
parsedmarc/__init__.py

@@ -2,6 +2,10 @@

"""A Python package for parsing DMARC reports"""

from __future__ import annotations

from typing import Dict, List, Any, Union, IO, Callable

import binascii
import email
import email.utils
@@ -17,9 +21,8 @@ import zlib
from base64 import b64decode
from collections import OrderedDict
from csv import DictWriter
from datetime import datetime, timedelta
from datetime import datetime, timedelta, timezone, tzinfo
from io import BytesIO, StringIO
from typing import Callable

import mailparser
import xmltodict
@@ -34,12 +37,13 @@ from parsedmarc.mail import (
    MSGraphConnection,
    GmailConnection,
)

from parsedmarc.constants import __version__
from parsedmarc.utils import get_base_domain, get_ip_address_info
from parsedmarc.utils import is_outlook_msg, convert_outlook_msg
from parsedmarc.utils import parse_email
from parsedmarc.utils import timestamp_to_human, human_timestamp_to_datetime

__version__ = "8.15.4"

logger.debug("parsedmarc v{0}".format(__version__))

@@ -54,6 +58,7 @@ MAGIC_XML = b"\x3c\x3f\x78\x6d\x6c\x20"
MAGIC_JSON = b"\7b"

IP_ADDRESS_CACHE = ExpiringDict(max_len=10000, max_age_seconds=14400)
SEEN_AGGREGATE_REPORT_IDS = ExpiringDict(max_len=100000000, max_age_seconds=3600)
REVERSE_DNS_MAP = dict()
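# Editor's note: the SEEN_AGGREGATE_REPORT_IDS cache above backs the
# duplicate-report check introduced in 8.17.0; keys expire after an hour, so a
# report with the same org_name and report_id is only skipped within that
# window. A standalone sketch of the pattern (illustrative, not the package's
# exact code):

from expiringdict import ExpiringDict

# Entries expire after 3600 seconds, mirroring the one-hour deduplication window.
seen_aggregate_report_ids = ExpiringDict(max_len=100_000, max_age_seconds=3600)


def is_duplicate_report(org_name: str, report_id: str) -> bool:
    key = f"{org_name}_{report_id}"
    if key in seen_aggregate_report_ids:
        return True
    seen_aggregate_report_ids[key] = True
    return False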
@@ -77,15 +82,196 @@ class InvalidForensicReport(InvalidDMARCReport):
    """Raised when an invalid DMARC forensic report is encountered"""


def _bucket_interval_by_day(
    begin: datetime,
    end: datetime,
    total_count: int,
) -> List[Dict[str, Any]]:
    """
    Split the interval [begin, end) into daily buckets and distribute
    `total_count` proportionally across those buckets.

    The function:
      1. Identifies each calendar day touched by [begin, end)
      2. Computes how many seconds of the interval fall into each day
      3. Assigns counts in proportion to those overlaps
      4. Ensures the final counts sum exactly to total_count

    Args:
        begin: timezone-aware datetime, inclusive start of interval
        end: timezone-aware datetime, exclusive end of interval
        total_count: number of messages to distribute

    Returns:
        A list of dicts like:
            {
                "begin": datetime,
                "end": datetime,
                "count": int
            }
    """
    # --- Input validation ----------------------------------------------------
    if begin > end:
        raise ValueError("begin must be earlier than end")
    if begin.tzinfo is None or end.tzinfo is None:
        raise ValueError("begin and end must be timezone-aware")
    if begin.tzinfo is not end.tzinfo:
        raise ValueError("begin and end must have the same tzinfo")
    if total_count < 0:
        raise ValueError("total_count must be non-negative")

    # --- Short-circuit trivial cases -----------------------------------------
    interval_seconds = (end - begin).total_seconds()
    if interval_seconds <= 0 or total_count == 0:
        return []

    tz: tzinfo = begin.tzinfo

    # --- Step 1: Determine all calendar days touched by [begin, end) ----------
    #
    # For example:
    #   begin = Jan 1 12:00
    #   end   = Jan 3 06:00
    #
    # We need buckets for:
    #   Jan 1 12:00 → Jan 2 00:00
    #   Jan 2 00:00 → Jan 3 00:00
    #   Jan 3 00:00 → Jan 3 06:00
    #

    # Start at midnight on the day of `begin`.
    day_cursor = datetime(begin.year, begin.month, begin.day, tzinfo=tz)

    # If `begin` is earlier on that day (e.g. 10:00), we want that midnight.
    # If `begin` is past that midnight (e.g. 00:30), this is correct.
    # If `begin` is BEFORE that midnight (rare unless tz shifts), adjust:
    if day_cursor > begin:
        day_cursor -= timedelta(days=1)

    day_buckets: List[Dict[str, Any]] = []

    while day_cursor < end:
        day_start = day_cursor
        day_end = day_cursor + timedelta(days=1)

        # Overlap between [begin, end) and this day
        overlap_start = max(begin, day_start)
        overlap_end = min(end, day_end)

        overlap_seconds = (overlap_end - overlap_start).total_seconds()

        if overlap_seconds > 0:
            day_buckets.append(
                {
                    "begin": overlap_start,
                    "end": overlap_end,
                    "seconds": overlap_seconds,
                }
            )

        day_cursor = day_end

    # --- Step 2: Pro-rate counts across buckets -------------------------------
    #
    # Compute the exact fractional count for each bucket:
    #   bucket_fraction = bucket_seconds / interval_seconds
    #   bucket_exact = total_count * bucket_fraction
    #
    # Then apply a "largest remainder" rounding strategy to ensure the sum
    # equals exactly total_count.

    exact_values: List[float] = [
        (b["seconds"] / interval_seconds) * total_count for b in day_buckets
    ]

    floor_values: List[int] = [int(x) for x in exact_values]
    fractional_parts: List[float] = [x - int(x) for x in exact_values]

    # How many counts do we still need to distribute after flooring?
    remainder = total_count - sum(floor_values)

    # Sort buckets by descending fractional remainder
    indices_by_fraction = sorted(
        range(len(day_buckets)),
        key=lambda i: fractional_parts[i],
        reverse=True,
    )

    # Start with floor values
    final_counts = floor_values[:]

    # Add +1 to the buckets with the largest fractional parts
    for idx in indices_by_fraction[:remainder]:
        final_counts[idx] += 1

    # --- Step 3: Build the final per-day result list -------------------------
    results: List[Dict[str, Any]] = []
    for bucket, count in zip(day_buckets, final_counts):
        if count > 0:
            results.append(
                {
                    "begin": bucket["begin"],
                    "end": bucket["end"],
                    "count": count,
                }
            )

    return results
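# Editor's note: an illustrative call to the helper above (values chosen
# arbitrarily): a 60-hour interval holding 10 messages is split into three
# daily buckets whose counts still sum to 10.

from datetime import datetime, timezone

begin = datetime(2024, 1, 1, 12, 0, tzinfo=timezone.utc)
end = datetime(2024, 1, 4, 0, 0, tzinfo=timezone.utc)  # 60 hours later

for bucket in _bucket_interval_by_day(begin, end, total_count=10):
    print(bucket["begin"], bucket["end"], bucket["count"])
# Jan 1 12:00 to midnight (12 h) gets 2 messages; Jan 2 and Jan 3 (24 h each)
# get 4 apiece.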
def _append_parsed_record(
    parsed_record: Dict[str, Any],
    records: List[Dict[str, Any]],
    begin_dt: datetime,
    end_dt: datetime,
    normalize: bool,
) -> None:
    """
    Append a parsed DMARC record either unchanged or normalized.

    Args:
        parsed_record: The record returned by _parse_report_record().
        records: Accumulating list of output records.
        begin_dt: Report-level begin datetime (UTC).
        end_dt: Report-level end datetime (UTC).
        normalize: Whether this report exceeded the allowed timespan
            and should be normalized per-day.
    """

    if not normalize:
        parsed_record["normalized_timespan"] = False
        parsed_record["interval_begin"] = begin_dt.strftime("%Y-%m-%d %H:%M:%S")
        parsed_record["interval_end"] = end_dt.strftime("%Y-%m-%d %H:%M:%S")

        records.append(parsed_record)
        return

    # Normalization path: break record into daily buckets
    total_count = int(parsed_record.get("count", 0))
    buckets = _bucket_interval_by_day(begin_dt, end_dt, total_count)
    if not buckets:
        return

    for part_index, bucket in enumerate(buckets):
        new_rec = parsed_record.copy()
        new_rec["count"] = bucket["count"]
        new_rec["normalized_timespan"] = True

        new_rec["interval_begin"] = bucket["begin"].strftime("%Y-%m-%d %H:%M:%S")
        new_rec["interval_end"] = bucket["end"].strftime("%Y-%m-%d %H:%M:%S")

        records.append(new_rec)
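# Editor's note: putting the two helpers together, a record whose report spans
# several days is emitted as several per-day records. A hypothetical example:

from datetime import datetime, timezone

records = []
record = {"count": 9, "source": {"ip_address": "192.0.2.1"}}  # hypothetical record
_append_parsed_record(
    parsed_record=record,
    records=records,
    begin_dt=datetime(2024, 1, 1, tzinfo=timezone.utc),
    end_dt=datetime(2024, 1, 4, tzinfo=timezone.utc),
    normalize=True,
)
# records now holds three copies, one per day, each with count == 3,
# normalized_timespan == True, and its own interval_begin/interval_end.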
def _parse_report_record(
|
||||
record,
|
||||
ip_db_path=None,
|
||||
always_use_local_files=False,
|
||||
reverse_dns_map_path=None,
|
||||
reverse_dns_map_url=None,
|
||||
offline=False,
|
||||
nameservers=None,
|
||||
dns_timeout=2.0,
|
||||
record: dict,
|
||||
ip_db_path: str = None,
|
||||
always_use_local_files: bool = False,
|
||||
reverse_dns_map_path: str = None,
|
||||
reverse_dns_map_url: str = None,
|
||||
offline: bool = False,
|
||||
nameservers: list[str] = None,
|
||||
dns_timeout: float = 2.0,
|
||||
):
|
||||
"""
|
||||
Converts a record from a DMARC aggregate report into a more consistent
|
||||
@@ -240,7 +426,7 @@ def _parse_report_record(
|
||||
return new_record
|
||||
|
||||
|
||||
def _parse_smtp_tls_failure_details(failure_details):
|
||||
def _parse_smtp_tls_failure_details(failure_details: dict):
|
||||
try:
|
||||
new_failure_details = OrderedDict(
|
||||
result_type=failure_details["result-type"],
|
||||
@@ -271,19 +457,19 @@ def _parse_smtp_tls_failure_details(failure_details):
|
||||
return new_failure_details
|
||||
|
||||
except KeyError as e:
|
||||
raise InvalidSMTPTLSReport(f"Missing required failure details field:" f" {e}")
|
||||
raise InvalidSMTPTLSReport(f"Missing required failure details field: {e}")
|
||||
except Exception as e:
|
||||
raise InvalidSMTPTLSReport(str(e))
|
||||
|
||||
|
||||
def _parse_smtp_tls_report_policy(policy):
|
||||
def _parse_smtp_tls_report_policy(policy: dict):
|
||||
policy_types = ["tlsa", "sts", "no-policy-found"]
|
||||
try:
|
||||
policy_domain = policy["policy"]["policy-domain"]
|
||||
policy_type = policy["policy"]["policy-type"]
|
||||
failure_details = []
|
||||
if policy_type not in policy_types:
|
||||
raise InvalidSMTPTLSReport(f"Invalid policy type " f"{policy_type}")
|
||||
raise InvalidSMTPTLSReport(f"Invalid policy type {policy_type}")
|
||||
new_policy = OrderedDict(policy_domain=policy_domain, policy_type=policy_type)
|
||||
if "policy-string" in policy["policy"]:
|
||||
if isinstance(policy["policy"]["policy-string"], list):
|
||||
@@ -313,7 +499,7 @@ def _parse_smtp_tls_report_policy(policy):
|
||||
raise InvalidSMTPTLSReport(str(e))
|
||||
|
||||
|
||||
def parse_smtp_tls_report_json(report):
|
||||
def parse_smtp_tls_report_json(report: dict):
|
||||
"""Parses and validates an SMTP TLS report"""
|
||||
required_fields = [
|
||||
"organization-name",
|
||||
@@ -331,9 +517,7 @@ def parse_smtp_tls_report_json(report):
|
||||
raise Exception(f"Missing required field: {required_field}")
|
||||
if not isinstance(report["policies"], list):
|
||||
policies_type = type(report["policies"])
|
||||
raise InvalidSMTPTLSReport(
|
||||
f"policies must be a list, " f"not {policies_type}"
|
||||
)
|
||||
raise InvalidSMTPTLSReport(f"policies must be a list, not {policies_type}")
|
||||
for policy in report["policies"]:
|
||||
policies.append(_parse_smtp_tls_report_policy(policy))
|
||||
|
||||
@@ -354,7 +538,7 @@ def parse_smtp_tls_report_json(report):
|
||||
raise InvalidSMTPTLSReport(str(e))
|
||||
|
||||
|
||||
def parsed_smtp_tls_reports_to_csv_rows(reports):
|
||||
def parsed_smtp_tls_reports_to_csv_rows(reports: dict):
|
||||
"""Converts one oor more parsed SMTP TLS reports into a list of single
|
||||
layer OrderedDict objects suitable for use in a CSV"""
|
||||
if type(reports) is OrderedDict:
|
||||
@@ -389,7 +573,7 @@ def parsed_smtp_tls_reports_to_csv_rows(reports):
|
||||
return rows
|
||||
|
||||
|
||||
def parsed_smtp_tls_reports_to_csv(reports):
|
||||
def parsed_smtp_tls_reports_to_csv(reports: dict):
|
||||
"""
|
||||
Converts one or more parsed SMTP TLS reports to flat CSV format, including
|
||||
headers
|
||||
@@ -435,15 +619,16 @@ def parsed_smtp_tls_reports_to_csv(reports):
|
||||
|
||||
|
||||
def parse_aggregate_report_xml(
|
||||
xml,
|
||||
ip_db_path=None,
|
||||
always_use_local_files=False,
|
||||
reverse_dns_map_path=None,
|
||||
reverse_dns_map_url=None,
|
||||
offline=False,
|
||||
nameservers=None,
|
||||
timeout=2.0,
|
||||
keep_alive=None,
|
||||
xml: str,
|
||||
ip_db_path: str = None,
|
||||
always_use_local_files: bool = False,
|
||||
reverse_dns_map_path: str = None,
|
||||
reverse_dns_map_url: str = None,
|
||||
offline: bool = False,
|
||||
nameservers: list[str] = None,
|
||||
timeout: float = 2.0,
|
||||
keep_alive: callable = None,
|
||||
normalize_timespan_threshold_hours: float = 24.0,
|
||||
):
|
||||
"""Parses a DMARC XML report string and returns a consistent OrderedDict
|
||||
|
||||
@@ -458,6 +643,7 @@ def parse_aggregate_report_xml(
|
||||
(Cloudflare's public DNS resolvers by default)
|
||||
timeout (float): Sets the DNS timeout in seconds
|
||||
keep_alive (callable): Keep alive function
|
||||
normalize_timespan_threshold_hours (float): Normalize timespans beyond this
|
||||
|
||||
Returns:
|
||||
OrderedDict: The parsed aggregate DMARC report
|
||||
@@ -522,13 +708,27 @@ def parse_aggregate_report_xml(
|
||||
report_id = report_id.replace("<", "").replace(">", "").split("@")[0]
|
||||
new_report_metadata["report_id"] = report_id
|
||||
date_range = report["report_metadata"]["date_range"]
|
||||
if int(date_range["end"]) - int(date_range["begin"]) > 2 * 86400:
|
||||
_error = "Time span > 24 hours - RFC 7489 section 7.2"
|
||||
raise InvalidAggregateReport(_error)
|
||||
date_range["begin"] = timestamp_to_human(date_range["begin"])
|
||||
date_range["end"] = timestamp_to_human(date_range["end"])
|
||||
|
||||
begin_ts = int(date_range["begin"])
|
||||
end_ts = int(date_range["end"])
|
||||
span_seconds = end_ts - begin_ts
|
||||
|
||||
normalize_timespan = span_seconds > normalize_timespan_threshold_hours * 3600
|
||||
|
||||
date_range["begin"] = timestamp_to_human(begin_ts)
|
||||
date_range["end"] = timestamp_to_human(end_ts)
|
||||
|
||||
new_report_metadata["begin_date"] = date_range["begin"]
|
||||
new_report_metadata["end_date"] = date_range["end"]
|
||||
new_report_metadata["timespan_requires_normalization"] = normalize_timespan
|
||||
new_report_metadata["original_timespan_seconds"] = span_seconds
|
||||
begin_dt = human_timestamp_to_datetime(
|
||||
new_report_metadata["begin_date"], to_utc=True
|
||||
)
|
||||
end_dt = human_timestamp_to_datetime(
|
||||
new_report_metadata["end_date"], to_utc=True
|
||||
)
|
||||
|
||||
if "error" in report["report_metadata"]:
|
||||
if not isinstance(report["report_metadata"]["error"], list):
|
||||
errors = [report["report_metadata"]["error"]]
|
||||
@@ -587,7 +787,13 @@ def parse_aggregate_report_xml(
|
||||
nameservers=nameservers,
|
||||
dns_timeout=timeout,
|
||||
)
|
||||
records.append(report_record)
|
||||
_append_parsed_record(
|
||||
parsed_record=report_record,
|
||||
records=records,
|
||||
begin_dt=begin_dt,
|
||||
end_dt=end_dt,
|
||||
normalize=normalize_timespan,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("Could not parse record: {0}".format(e))
|
||||
|
||||
@@ -602,7 +808,13 @@ def parse_aggregate_report_xml(
|
||||
nameservers=nameservers,
|
||||
dns_timeout=timeout,
|
||||
)
|
||||
records.append(report_record)
|
||||
_append_parsed_record(
|
||||
parsed_record=report_record,
|
||||
records=records,
|
||||
begin_dt=begin_dt,
|
||||
end_dt=end_dt,
|
||||
normalize=normalize_timespan,
|
||||
)
|
||||
|
||||
new_report["records"] = records
|
||||
|
||||
@@ -620,7 +832,7 @@ def parse_aggregate_report_xml(
|
||||
raise InvalidAggregateReport("Unexpected error: {0}".format(error.__str__()))
|
||||
|
||||
|
||||
def extract_report(content):
|
||||
def extract_report(content: Union[bytes, str, IO[Any]]):
|
||||
"""
|
||||
Extracts text from a zip or gzip file, as a base64-encoded string,
|
||||
file-like object, or bytes.
|
||||
@@ -684,15 +896,16 @@ def extract_report_from_file_path(file_path):
|
||||
|
||||
|
||||
def parse_aggregate_report_file(
|
||||
_input,
|
||||
offline=False,
|
||||
always_use_local_files=None,
|
||||
reverse_dns_map_path=None,
|
||||
reverse_dns_map_url=None,
|
||||
ip_db_path=None,
|
||||
nameservers=None,
|
||||
dns_timeout=2.0,
|
||||
keep_alive=None,
|
||||
_input: Union[str, bytes, IO[Any]],
|
||||
offline: bool = False,
|
||||
always_use_local_files: bool = None,
|
||||
reverse_dns_map_path: str = None,
|
||||
reverse_dns_map_url: str = None,
|
||||
ip_db_path: str = None,
|
||||
nameservers: list[str] = None,
|
||||
dns_timeout: float = 2.0,
|
||||
keep_alive: Callable = None,
|
||||
normalize_timespan_threshold_hours: float = 24.0,
|
||||
):
|
||||
"""Parses a file at the given path, a file-like object. or bytes as an
|
||||
aggregate DMARC report
|
||||
@@ -708,6 +921,7 @@ def parse_aggregate_report_file(
|
||||
(Cloudflare's public DNS resolvers by default)
|
||||
dns_timeout (float): Sets the DNS timeout in seconds
|
||||
keep_alive (callable): Keep alive function
|
||||
normalize_timespan_threshold_hours (float): Normalize timespans beyond this
|
||||
|
||||
Returns:
|
||||
OrderedDict: The parsed DMARC aggregate report
|
||||
@@ -728,10 +942,11 @@ def parse_aggregate_report_file(
|
||||
nameservers=nameservers,
|
||||
timeout=dns_timeout,
|
||||
keep_alive=keep_alive,
|
||||
normalize_timespan_threshold_hours=normalize_timespan_threshold_hours,
|
||||
)
|
||||
|
||||
|
||||
def parsed_aggregate_reports_to_csv_rows(reports):
|
||||
def parsed_aggregate_reports_to_csv_rows(reports: list[dict]):
|
||||
"""
|
||||
Converts one or more parsed aggregate reports to list of dicts in flat CSV
|
||||
format
|
||||
@@ -760,6 +975,9 @@ def parsed_aggregate_reports_to_csv_rows(reports):
|
||||
report_id = report["report_metadata"]["report_id"]
|
||||
begin_date = report["report_metadata"]["begin_date"]
|
||||
end_date = report["report_metadata"]["end_date"]
|
||||
normalized_timespan = report["report_metadata"][
|
||||
"timespan_requires_normalization"
|
||||
]
|
||||
errors = "|".join(report["report_metadata"]["errors"])
|
||||
domain = report["policy_published"]["domain"]
|
||||
adkim = report["policy_published"]["adkim"]
|
||||
@@ -777,6 +995,7 @@ def parsed_aggregate_reports_to_csv_rows(reports):
|
||||
report_id=report_id,
|
||||
begin_date=begin_date,
|
||||
end_date=end_date,
|
||||
normalized_timespan=normalized_timespan,
|
||||
errors=errors,
|
||||
domain=domain,
|
||||
adkim=adkim,
|
||||
@@ -789,6 +1008,8 @@ def parsed_aggregate_reports_to_csv_rows(reports):
|
||||
|
||||
for record in report["records"]:
|
||||
row = report_dict.copy()
|
||||
row["begin_date"] = record["interval_begin"]
|
||||
row["end_date"] = record["interval_end"]
|
||||
row["source_ip_address"] = record["source"]["ip_address"]
|
||||
row["source_country"] = record["source"]["country"]
|
||||
row["source_reverse_dns"] = record["source"]["reverse_dns"]
|
||||
@@ -849,7 +1070,7 @@ def parsed_aggregate_reports_to_csv_rows(reports):
|
||||
return rows
|
||||
|
||||
|
||||
def parsed_aggregate_reports_to_csv(reports):
|
||||
def parsed_aggregate_reports_to_csv(reports: list[OrderedDict]):
|
||||
"""
|
||||
Converts one or more parsed aggregate reports to flat CSV format, including
|
||||
headers
|
||||
@@ -869,6 +1090,7 @@ def parsed_aggregate_reports_to_csv(reports):
|
||||
"report_id",
|
||||
"begin_date",
|
||||
"end_date",
|
||||
"normalized_timespan",
|
||||
"errors",
|
||||
"domain",
|
||||
"adkim",
|
||||
@@ -915,17 +1137,17 @@ def parsed_aggregate_reports_to_csv(reports):
|
||||
|
||||
|
||||
def parse_forensic_report(
|
||||
feedback_report,
|
||||
sample,
|
||||
msg_date,
|
||||
always_use_local_files=False,
|
||||
reverse_dns_map_path=None,
|
||||
reverse_dns_map_url=None,
|
||||
offline=False,
|
||||
ip_db_path=None,
|
||||
nameservers=None,
|
||||
dns_timeout=2.0,
|
||||
strip_attachment_payloads=False,
|
||||
feedback_report: str,
|
||||
sample: str,
|
||||
msg_date: datetime,
|
||||
always_use_local_files: bool = False,
|
||||
reverse_dns_map_path: str = None,
|
||||
reverse_dns_map_url: str = None,
|
||||
offline: bool = False,
|
||||
ip_db_path: str = None,
|
||||
nameservers: list[str] = None,
|
||||
dns_timeout: float = 2.0,
|
||||
strip_attachment_payloads: bool = False,
|
||||
):
|
||||
"""
|
||||
Converts a DMARC forensic report and sample to an ``OrderedDict``
|
||||
@@ -1054,7 +1276,7 @@ def parse_forensic_report(
|
||||
raise InvalidForensicReport("Unexpected error: {0}".format(error.__str__()))
|
||||
|
||||
|
||||
def parsed_forensic_reports_to_csv_rows(reports):
|
||||
def parsed_forensic_reports_to_csv_rows(reports: list[OrderedDict]):
|
||||
"""
|
||||
Converts one or more parsed forensic reports to a list of dicts in flat CSV
|
||||
format
|
||||
@@ -1090,7 +1312,7 @@ def parsed_forensic_reports_to_csv_rows(reports):
|
||||
return rows
|
||||
|
||||
|
||||
def parsed_forensic_reports_to_csv(reports):
|
||||
def parsed_forensic_reports_to_csv(reports: list[dict]):
|
||||
"""
|
||||
Converts one or more parsed forensic reports to flat CSV format, including
|
||||
headers
|
||||
@@ -1143,16 +1365,17 @@ def parsed_forensic_reports_to_csv(reports):
|
||||
|
||||
|
||||
def parse_report_email(
|
||||
input_,
|
||||
offline=False,
|
||||
ip_db_path=None,
|
||||
always_use_local_files=False,
|
||||
reverse_dns_map_path=None,
|
||||
reverse_dns_map_url=None,
|
||||
nameservers=None,
|
||||
dns_timeout=2.0,
|
||||
strip_attachment_payloads=False,
|
||||
keep_alive=None,
|
||||
input_: Union[bytes, str],
|
||||
offline: bool = False,
|
||||
ip_db_path: str = None,
|
||||
always_use_local_files: bool = False,
|
||||
reverse_dns_map_path: str = None,
|
||||
reverse_dns_map_url: str = None,
|
||||
nameservers: list[str] = None,
|
||||
dns_timeout: float = 2.0,
|
||||
strip_attachment_payloads: bool = False,
|
||||
keep_alive: callable = None,
|
||||
normalize_timespan_threshold_hours: float = 24.0,
|
||||
):
|
||||
"""
|
||||
Parses a DMARC report from an email
|
||||
@@ -1169,6 +1392,7 @@ def parse_report_email(
|
||||
strip_attachment_payloads (bool): Remove attachment payloads from
|
||||
forensic report results
|
||||
keep_alive (callable): keep alive function
|
||||
normalize_timespan_threshold_hours (float): Normalize timespans beyond this
|
||||
|
||||
Returns:
|
||||
OrderedDict:
|
||||
@@ -1184,7 +1408,7 @@ def parse_report_email(
|
||||
input_ = input_.decode(encoding="utf8", errors="replace")
|
||||
msg = mailparser.parse_from_string(input_)
|
||||
msg_headers = json.loads(msg.headers_json)
|
||||
date = email.utils.format_datetime(datetime.utcnow())
|
||||
date = email.utils.format_datetime(datetime.now(timezone.utc))
|
||||
if "Date" in msg_headers:
|
||||
date = human_timestamp_to_datetime(msg_headers["Date"])
|
||||
msg = email.message_from_string(input_)
|
||||
@@ -1200,12 +1424,16 @@ def parse_report_email(
|
||||
if "Subject" in msg_headers:
|
||||
subject = msg_headers["Subject"]
|
||||
for part in msg.walk():
|
||||
content_type = part.get_content_type()
|
||||
content_type = part.get_content_type().lower()
|
||||
payload = part.get_payload()
|
||||
if not isinstance(payload, list):
|
||||
payload = [payload]
|
||||
payload = payload[0].__str__()
|
||||
if content_type == "message/feedback-report":
|
||||
if content_type.startswith("multipart/"):
|
||||
continue
|
||||
if content_type == "text/html":
|
||||
continue
|
||||
elif content_type == "message/feedback-report":
|
||||
try:
|
||||
if "Feedback-Type" in payload:
|
||||
feedback_report = payload
|
||||
@@ -1216,13 +1444,12 @@ def parse_report_email(
|
||||
feedback_report = feedback_report.replace("\\n", "\n")
|
||||
except (ValueError, TypeError, binascii.Error):
|
||||
feedback_report = payload
|
||||
|
||||
elif content_type == "text/rfc822-headers":
|
||||
sample = payload
|
||||
elif content_type == "message/rfc822":
|
||||
sample = payload
|
||||
elif content_type == "application/tlsrpt+json":
|
||||
if "{" not in payload:
|
||||
if not payload.strip().startswith("{"):
|
||||
payload = str(b64decode(payload))
|
||||
smtp_tls_report = parse_smtp_tls_report_json(payload)
|
||||
return OrderedDict(
|
||||
@@ -1234,7 +1461,6 @@ def parse_report_email(
|
||||
return OrderedDict(
|
||||
[("report_type", "smtp_tls"), ("report", smtp_tls_report)]
|
||||
)
|
||||
|
||||
elif content_type == "text/plain":
|
||||
if "A message claiming to be from you has failed" in payload:
|
||||
try:
|
||||
@@ -1245,11 +1471,11 @@ def parse_report_email(
|
||||
field_name = match[0].lower().replace(" ", "-")
|
||||
fields[field_name] = match[1].strip()
|
||||
|
||||
feedback_report = "Arrival-Date: {}\n" "Source-IP: {}" "".format(
|
||||
feedback_report = "Arrival-Date: {}\nSource-IP: {}".format(
|
||||
fields["received-date"], fields["sender-ip-address"]
|
||||
)
|
||||
except Exception as e:
|
||||
error = "Unable to parse message with " 'subject "{0}": {1}'.format(
|
||||
error = 'Unable to parse message with subject "{0}": {1}'.format(
|
||||
subject, e
|
||||
)
|
||||
raise InvalidDMARCReport(error)
|
||||
@@ -1261,13 +1487,14 @@ def parse_report_email(
|
||||
payload = b64decode(payload)
|
||||
if payload.startswith(MAGIC_ZIP) or payload.startswith(MAGIC_GZIP):
|
||||
payload = extract_report(payload)
|
||||
ns = nameservers
|
||||
if payload.startswith("{"):
|
||||
smtp_tls_report = parse_smtp_tls_report_json(payload)
|
||||
result = OrderedDict(
|
||||
[("report_type", "smtp_tls"), ("report", smtp_tls_report)]
|
||||
)
|
||||
return result
|
||||
if isinstance(payload, bytes):
|
||||
payload = payload.decode("utf-8", errors="replace")
|
||||
if payload.strip().startswith("{"):
|
||||
smtp_tls_report = parse_smtp_tls_report_json(payload)
|
||||
result = OrderedDict(
|
||||
[("report_type", "smtp_tls"), ("report", smtp_tls_report)]
|
||||
)
|
||||
elif payload.strip().startswith("<"):
|
||||
aggregate_report = parse_aggregate_report_xml(
|
||||
payload,
|
||||
ip_db_path=ip_db_path,
|
||||
@@ -1275,28 +1502,28 @@ def parse_report_email(
|
||||
reverse_dns_map_path=reverse_dns_map_path,
|
||||
reverse_dns_map_url=reverse_dns_map_url,
|
||||
offline=offline,
|
||||
nameservers=ns,
|
||||
nameservers=nameservers,
|
||||
timeout=dns_timeout,
|
||||
keep_alive=keep_alive,
|
||||
normalize_timespan_threshold_hours=normalize_timespan_threshold_hours,
|
||||
)
|
||||
result = OrderedDict(
|
||||
[("report_type", "aggregate"), ("report", aggregate_report)]
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
except (TypeError, ValueError, binascii.Error):
|
||||
pass
|
||||
|
||||
except InvalidAggregateReport as e:
|
||||
error = (
|
||||
'Message with subject "{0}" '
|
||||
"is not a valid "
|
||||
"aggregate DMARC report: {1}".format(subject, e)
|
||||
except InvalidDMARCReport:
|
||||
error = 'Message with subject "{0}" is not a valid DMARC report'.format(
|
||||
subject
|
||||
)
|
||||
raise ParserError(error)
|
||||
|
||||
except Exception as e:
|
||||
error = "Unable to parse message with " 'subject "{0}": {1}'.format(
|
||||
error = 'Unable to parse message with subject "{0}": {1}'.format(
|
||||
subject, e
|
||||
)
|
||||
raise ParserError(error)
|
||||
@@ -1330,21 +1557,22 @@ def parse_report_email(
|
||||
return result
|
||||
|
||||
if result is None:
|
||||
error = 'Message with subject "{0}" is ' "not a valid report".format(subject)
|
||||
error = 'Message with subject "{0}" is not a valid report'.format(subject)
|
||||
raise InvalidDMARCReport(error)
|
||||
|
||||
|
||||
def parse_report_file(
|
||||
input_,
|
||||
nameservers=None,
|
||||
dns_timeout=2.0,
|
||||
strip_attachment_payloads=False,
|
||||
ip_db_path=None,
|
||||
always_use_local_files=False,
|
||||
reverse_dns_map_path=None,
|
||||
reverse_dns_map_url=None,
|
||||
offline=False,
|
||||
keep_alive=None,
|
||||
input_: Union[bytes, str, IO[Any]],
|
||||
nameservers: list[str] = None,
|
||||
dns_timeout: float = 2.0,
|
||||
strip_attachment_payloads: bool = False,
|
||||
ip_db_path: str = None,
|
||||
always_use_local_files: bool = False,
|
||||
reverse_dns_map_path: str = None,
|
||||
reverse_dns_map_url: str = None,
|
||||
offline: bool = False,
|
||||
keep_alive: Callable = None,
|
||||
normalize_timespan_threshold_hours: float = 24,
|
||||
):
|
||||
"""Parses a DMARC aggregate or forensic file at the given path, a
|
||||
file-like object. or bytes
|
||||
@@ -1387,6 +1615,7 @@ def parse_report_file(
|
||||
nameservers=nameservers,
|
||||
dns_timeout=dns_timeout,
|
||||
keep_alive=keep_alive,
|
||||
normalize_timespan_threshold_hours=normalize_timespan_threshold_hours,
|
||||
)
|
||||
results = OrderedDict([("report_type", "aggregate"), ("report", report)])
|
||||
except InvalidAggregateReport:
|
||||
@@ -1407,6 +1636,7 @@ def parse_report_file(
|
||||
dns_timeout=dns_timeout,
|
||||
strip_attachment_payloads=sa,
|
||||
keep_alive=keep_alive,
|
||||
normalize_timespan_threshold_hours=normalize_timespan_threshold_hours,
|
||||
)
|
||||
except InvalidDMARCReport:
|
||||
raise ParserError("Not a valid report")
|
||||
@@ -1414,15 +1644,16 @@


def get_dmarc_reports_from_mbox(
input_,
nameservers=None,
dns_timeout=2.0,
strip_attachment_payloads=False,
ip_db_path=None,
always_use_local_files=False,
reverse_dns_map_path=None,
reverse_dns_map_url=None,
offline=False,
input_: str,
nameservers: list[str] = None,
dns_timeout: float = 2.0,
strip_attachment_payloads: bool = False,
ip_db_path: str = None,
always_use_local_files: bool = False,
reverse_dns_map_path: str = None,
reverse_dns_map_url: str = None,
offline: bool = False,
normalize_timespan_threshold_hours: float = 24.0,
):
"""Parses a mailbox in mbox format containing e-mails with attached
DMARC reports
@@ -1439,6 +1670,7 @@ def get_dmarc_reports_from_mbox(
reverse_dns_map_url (str): URL to a reverse DNS map file
ip_db_path (str): Path to a MMDB file from MaxMind or DBIP
offline (bool): Do not make online queries for geolocation or DNS
normalize_timespan_threshold_hours (float): Normalize timespans beyond this

Returns:
OrderedDict: Lists of ``aggregate_reports`` and ``forensic_reports``
@@ -1468,9 +1700,20 @@ def get_dmarc_reports_from_mbox(
nameservers=nameservers,
dns_timeout=dns_timeout,
strip_attachment_payloads=sa,
normalize_timespan_threshold_hours=normalize_timespan_threshold_hours,
)
if parsed_email["report_type"] == "aggregate":
aggregate_reports.append(parsed_email["report"])
report_org = parsed_email["report"]["report_metadata"]["org_name"]
report_id = parsed_email["report"]["report_metadata"]["report_id"]
report_key = f"{report_org}_{report_id}"
if report_key not in SEEN_AGGREGATE_REPORT_IDS:
SEEN_AGGREGATE_REPORT_IDS[report_key] = True
aggregate_reports.append(parsed_email["report"])
else:
logger.debug(
"Skipping duplicate aggregate report "
f"from {report_org} with ID: {report_id}"
)
elif parsed_email["report_type"] == "forensic":
forensic_reports.append(parsed_email["report"])
elif parsed_email["report_type"] == "smtp_tls":
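A standalone sketch of the deduplication pattern added above; SEEN stands in for the module-level SEEN_AGGREGATE_REPORT_IDS mapping, and the key mirrors the f"{report_org}_{report_id}" format used in the diff:

SEEN = {}

def is_new_aggregate_report(report: dict) -> bool:
    metadata = report["report_metadata"]
    key = f"{metadata['org_name']}_{metadata['report_id']}"
    if key in SEEN:
        return False  # duplicate: skip it, as the code above does
    SEEN[key] = True
    return True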
@@ -1490,22 +1733,23 @@ def get_dmarc_reports_from_mbox(

def get_dmarc_reports_from_mailbox(
connection: MailboxConnection,
reports_folder="INBOX",
archive_folder="Archive",
delete=False,
test=False,
ip_db_path=None,
always_use_local_files=False,
reverse_dns_map_path=None,
reverse_dns_map_url=None,
offline=False,
nameservers=None,
dns_timeout=6.0,
strip_attachment_payloads=False,
results=None,
batch_size=10,
since=None,
create_folders=True,
reports_folder: str = "INBOX",
archive_folder: str = "Archive",
delete: bool = False,
test: bool = False,
ip_db_path: str = None,
always_use_local_files: bool = False,
reverse_dns_map_path: str = None,
reverse_dns_map_url: str = None,
offline: bool = False,
nameservers: list[str] = None,
dns_timeout: float = 6.0,
strip_attachment_payloads: bool = False,
results: dict = None,
batch_size: int = 10,
since: datetime = None,
create_folders: bool = True,
normalize_timespan_threshold_hours: float = 24,
):
"""
Fetches and parses DMARC reports from a mailbox
@@ -1532,6 +1776,7 @@ def get_dmarc_reports_from_mailbox(
(units - {"m":"minutes", "h":"hours", "d":"days", "w":"weeks"})
create_folders (bool): Whether to create the destination folders
(not used in watch)
normalize_timespan_threshold_hours (float): Normalize timespans beyond this

Returns:
OrderedDict: Lists of ``aggregate_reports`` and ``forensic_reports``
@@ -1570,7 +1815,7 @@ def get_dmarc_reports_from_mailbox(

if since:
_since = 1440 # default one day
if re.match(r"\d+[mhd]$", since):
if re.match(r"\d+[mhdw]$", since):
s = re.split(r"(\d+)", since)
if s[2] == "m":
_since = int(s[1])
@@ -1594,14 +1839,18 @@ def get_dmarc_reports_from_mailbox(
"Only days and weeks values in 'since' option are \
considered for IMAP connections. Examples: 2d or 1w"
)
since = (datetime.utcnow() - timedelta(minutes=_since)).date()
current_time = datetime.utcnow().date()
since = (datetime.now(timezone.utc) - timedelta(minutes=_since)).date()
current_time = datetime.now(timezone.utc).date()
elif isinstance(connection, MSGraphConnection):
since = (datetime.utcnow() - timedelta(minutes=_since)).isoformat() + "Z"
current_time = datetime.utcnow().isoformat() + "Z"
since = (
datetime.now(timezone.utc) - timedelta(minutes=_since)
).isoformat() + "Z"
current_time = datetime.now(timezone.utc).isoformat() + "Z"
elif isinstance(connection, GmailConnection):
since = (datetime.utcnow() - timedelta(minutes=_since)).strftime("%s")
current_time = datetime.utcnow().strftime("%s")
since = (datetime.now(timezone.utc) - timedelta(minutes=_since)).strftime(
"%s"
)
current_time = datetime.now(timezone.utc).strftime("%s")
else:
pass
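A sketch of the since handling shown above, assuming the same \d+[mhdw] format; it folds the unit conversion into a lookup table and uses the timezone-aware datetime.now(timezone.utc) that the diff switches to:

import re
from datetime import datetime, timedelta, timezone

UNIT_MINUTES = {"m": 1, "h": 60, "d": 1440, "w": 10080}

def since_to_cutoff(since: str) -> datetime:
    if not re.match(r"\d+[mhdw]$", since):
        raise ValueError("expected values like 30m, 12h, 2d or 1w")
    minutes = int(since[:-1]) * UNIT_MINUTES[since[-1]]
    return datetime.now(timezone.utc) - timedelta(minutes=minutes)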
@@ -1645,9 +1894,19 @@ def get_dmarc_reports_from_mailbox(
offline=offline,
strip_attachment_payloads=sa,
keep_alive=connection.keepalive,
normalize_timespan_threshold_hours=normalize_timespan_threshold_hours,
)
if parsed_email["report_type"] == "aggregate":
aggregate_reports.append(parsed_email["report"])
report_org = parsed_email["report"]["report_metadata"]["org_name"]
report_id = parsed_email["report"]["report_metadata"]["report_id"]
report_key = f"{report_org}_{report_id}"
if report_key not in SEEN_AGGREGATE_REPORT_IDS:
SEEN_AGGREGATE_REPORT_IDS[report_key] = True
aggregate_reports.append(parsed_email["report"])
else:
logger.debug(
f"Skipping duplicate aggregate report with ID: {report_id}"
)
aggregate_report_msg_uids.append(msg_uid)
elif parsed_email["report_type"] == "forensic":
forensic_reports.append(parsed_email["report"])
@@ -1688,7 +1947,7 @@ def get_dmarc_reports_from_mailbox(

except Exception as e:
message = "Error deleting message UID"
e = "{0} {1}: " "{2}".format(message, msg_uid, e)
e = "{0} {1}: {2}".format(message, msg_uid, e)
logger.error("Mailbox error: {0}".format(e))
else:
if len(aggregate_report_msg_uids) > 0:
@@ -1787,6 +2046,7 @@ def get_dmarc_reports_from_mailbox(
reverse_dns_map_url=reverse_dns_map_url,
offline=offline,
since=current_time,
normalize_timespan_threshold_hours=normalize_timespan_threshold_hours,
)

return results
@@ -1795,20 +2055,21 @@ def get_dmarc_reports_from_mailbox(
def watch_inbox(
mailbox_connection: MailboxConnection,
callback: Callable,
reports_folder="INBOX",
archive_folder="Archive",
delete=False,
test=False,
check_timeout=30,
ip_db_path=None,
always_use_local_files=False,
reverse_dns_map_path=None,
reverse_dns_map_url=None,
offline=False,
nameservers=None,
dns_timeout=6.0,
strip_attachment_payloads=False,
batch_size=None,
reports_folder: str = "INBOX",
archive_folder: str = "Archive",
delete: bool = False,
test: bool = False,
check_timeout: int = 30,
ip_db_path: str = None,
always_use_local_files: bool = False,
reverse_dns_map_path: str = None,
reverse_dns_map_url: str = None,
offline: bool = False,
nameservers: list[str] = None,
dns_timeout: float = 6.0,
strip_attachment_payloads: bool = False,
batch_size: int = None,
normalize_timespan_threshold_hours: float = 24,
):
"""
Watches the mailbox for new messages and
@@ -1834,6 +2095,7 @@ def watch_inbox(
strip_attachment_payloads (bool): Replace attachment payloads in
forensic report samples with None
batch_size (int): Number of messages to read and process before saving
normalize_timespan_threshold_hours (float): Normalize timespans beyond this
"""

def check_callback(connection):
@@ -1854,6 +2116,7 @@ def watch_inbox(
strip_attachment_payloads=sa,
batch_size=batch_size,
create_folders=False,
normalize_timespan_threshold_hours=normalize_timespan_threshold_hours,
)
callback(res)
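A usage sketch based only on the watch_inbox signature above; connection is assumed to be an already-constructed MailboxConnection, and the result keys match the lists of aggregate_reports and forensic_reports documented earlier:

def on_results(results):
    # Called by watch_inbox each time a batch of messages is processed.
    print(len(results["aggregate_reports"]), "aggregate reports")
    print(len(results["forensic_reports"]), "forensic reports")

watch_inbox(
    connection,
    on_results,
    reports_folder="INBOX",
    archive_folder="Archive",
    check_timeout=30,
)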
@@ -1896,14 +2159,14 @@ def append_csv(filename, csv):


def save_output(
results,
output_directory="output",
aggregate_json_filename="aggregate.json",
forensic_json_filename="forensic.json",
smtp_tls_json_filename="smtp_tls.json",
aggregate_csv_filename="aggregate.csv",
forensic_csv_filename="forensic.csv",
smtp_tls_csv_filename="smtp_tls.csv",
results: OrderedDict,
output_directory: str = "output",
aggregate_json_filename: str = "aggregate.json",
forensic_json_filename: str = "forensic.json",
smtp_tls_json_filename: str = "smtp_tls.json",
aggregate_csv_filename: str = "aggregate.csv",
forensic_csv_filename: str = "forensic.csv",
smtp_tls_csv_filename: str = "smtp_tls.csv",
):
"""
Save report data in the given directory
@@ -1981,7 +2244,7 @@ def save_output(
sample_file.write(sample)


def get_report_zip(results):
def get_report_zip(results: OrderedDict):
"""
Creates a zip file of parsed report output

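A sketch combining the two helpers above, assuming get_report_zip returns the zip archive as bytes; the mbox path is only an example:

results = get_dmarc_reports_from_mbox("reports.mbox")
save_output(results, output_directory="output")
with open("output.zip", "wb") as zip_file:
    zip_file.write(get_report_zip(results))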
@@ -9,11 +9,13 @@ from configparser import ConfigParser
from glob import glob
import logging
import math
import yaml
from collections import OrderedDict
import json
from ssl import CERT_NONE, create_default_context
from multiprocessing import Pipe, Process
import sys
import http.client
from tqdm import tqdm

from parsedmarc import (
@@ -45,7 +47,10 @@ from parsedmarc.mail import (
from parsedmarc.mail.graph import AuthMethod

from parsedmarc.log import logger
from parsedmarc.utils import is_mbox, get_reverse_dns
from parsedmarc.utils import is_mbox, get_reverse_dns, get_base_domain
from parsedmarc import SEEN_AGGREGATE_REPORT_IDS

http.client._MAXHEADERS = 200 # pylint:disable=protected-access

formatter = logging.Formatter(
fmt="%(levelname)8s:%(filename)s:%(lineno)d:%(message)s",
@@ -72,6 +77,7 @@ def cli_parse(
always_use_local_files,
reverse_dns_map_path,
reverse_dns_map_url,
normalize_timespan_threshold_hours,
conn,
):
"""Separated this function for multiprocessing"""
@@ -86,6 +92,7 @@ def cli_parse(
nameservers=nameservers,
dns_timeout=dns_timeout,
strip_attachment_payloads=sa,
normalize_timespan_threshold_hours=normalize_timespan_threshold_hours,
)
conn.send([file_results, file_path])
except ParserError as error:
@@ -97,8 +104,35 @@ def cli_parse(
def _main():
"""Called when the module is executed"""

def get_index_prefix(report):
if index_prefix_domain_map is None:
return None
if "policy_published" in report:
domain = report["policy_published"]["domain"]
elif "reported_domain" in report:
domain = report["reported_domain"]
elif "policies" in report:
domain = report["policies"][0]["domain"]
if domain:
domain = get_base_domain(domain)
for prefix in index_prefix_domain_map:
if domain in index_prefix_domain_map[prefix]:
prefix = (
prefix.lower()
.strip()
.strip("_")
.replace(" ", "_")
.replace("-", "_")
)
prefix = f"{prefix}_"
return prefix
return None
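get_index_prefix() above expects index_prefix_domain_map to map each prefix to the domains it covers; the mapping is loaded with yaml.safe_load further down in the config handling. The file layout below is an assumption consistent with that lookup:

import yaml

with open("index_prefix_domain_map.yml") as f:  # hypothetical file name
    index_prefix_domain_map = yaml.safe_load(f)

# Example contents:
#   Team A:
#     - example.com
#     - example.net
#   team-b:
#     - example.org
# A report for mail.example.org reduces to the base domain example.org,
# matches "team-b", and yields the index prefix "team_b_".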
def process_reports(reports_):
|
||||
output_str = "{0}\n".format(json.dumps(reports_, ensure_ascii=False, indent=2))
|
||||
indent_value = 2 if opts.prettify_json else None
|
||||
output_str = "{0}\n".format(
|
||||
json.dumps(reports_, ensure_ascii=False, indent=indent_value)
|
||||
)
|
||||
|
||||
if not opts.silent:
|
||||
print(output_str)
|
||||
@@ -122,7 +156,8 @@ def _main():
|
||||
elastic.save_aggregate_report_to_elasticsearch(
|
||||
report,
|
||||
index_suffix=opts.elasticsearch_index_suffix,
|
||||
index_prefix=opts.elasticsearch_index_prefix,
|
||||
index_prefix=opts.elasticsearch_index_prefix
|
||||
or get_index_prefix(report),
|
||||
monthly_indexes=opts.elasticsearch_monthly_indexes,
|
||||
number_of_shards=shards,
|
||||
number_of_replicas=replicas,
|
||||
@@ -143,7 +178,8 @@ def _main():
|
||||
opensearch.save_aggregate_report_to_opensearch(
|
||||
report,
|
||||
index_suffix=opts.opensearch_index_suffix,
|
||||
index_prefix=opts.opensearch_index_prefix,
|
||||
index_prefix=opts.opensearch_index_prefix
|
||||
or get_index_prefix(report),
|
||||
monthly_indexes=opts.opensearch_monthly_indexes,
|
||||
number_of_shards=shards,
|
||||
number_of_replicas=replicas,
|
||||
@@ -185,8 +221,9 @@ def _main():
|
||||
|
||||
try:
|
||||
if opts.webhook_aggregate_url:
|
||||
indent_value = 2 if opts.prettify_json else None
|
||||
webhook_client.save_aggregate_report_to_webhook(
|
||||
json.dumps(report, ensure_ascii=False, indent=2)
|
||||
json.dumps(report, ensure_ascii=False, indent=indent_value)
|
||||
)
|
||||
except Exception as error_:
|
||||
logger.error("Webhook Error: {0}".format(error_.__str__()))
|
||||
@@ -208,7 +245,8 @@ def _main():
|
||||
elastic.save_forensic_report_to_elasticsearch(
|
||||
report,
|
||||
index_suffix=opts.elasticsearch_index_suffix,
|
||||
index_prefix=opts.elasticsearch_index_prefix,
|
||||
index_prefix=opts.elasticsearch_index_prefix
|
||||
or get_index_prefix(report),
|
||||
monthly_indexes=opts.elasticsearch_monthly_indexes,
|
||||
number_of_shards=shards,
|
||||
number_of_replicas=replicas,
|
||||
@@ -227,7 +265,8 @@ def _main():
|
||||
opensearch.save_forensic_report_to_opensearch(
|
||||
report,
|
||||
index_suffix=opts.opensearch_index_suffix,
|
||||
index_prefix=opts.opensearch_index_prefix,
|
||||
index_prefix=opts.opensearch_index_prefix
|
||||
or get_index_prefix(report),
|
||||
monthly_indexes=opts.opensearch_monthly_indexes,
|
||||
number_of_shards=shards,
|
||||
number_of_replicas=replicas,
|
||||
@@ -267,8 +306,9 @@ def _main():
|
||||
|
||||
try:
|
||||
if opts.webhook_forensic_url:
|
||||
indent_value = 2 if opts.prettify_json else None
|
||||
webhook_client.save_forensic_report_to_webhook(
|
||||
json.dumps(report, ensure_ascii=False, indent=2)
|
||||
json.dumps(report, ensure_ascii=False, indent=indent_value)
|
||||
)
|
||||
except Exception as error_:
|
||||
logger.error("Webhook Error: {0}".format(error_.__str__()))
|
||||
@@ -290,7 +330,8 @@ def _main():
|
||||
elastic.save_smtp_tls_report_to_elasticsearch(
|
||||
report,
|
||||
index_suffix=opts.elasticsearch_index_suffix,
|
||||
index_prefix=opts.elasticsearch_index_prefix,
|
||||
index_prefix=opts.elasticsearch_index_prefix
|
||||
or get_index_prefix(report),
|
||||
monthly_indexes=opts.elasticsearch_monthly_indexes,
|
||||
number_of_shards=shards,
|
||||
number_of_replicas=replicas,
|
||||
@@ -309,7 +350,8 @@ def _main():
|
||||
opensearch.save_smtp_tls_report_to_opensearch(
|
||||
report,
|
||||
index_suffix=opts.opensearch_index_suffix,
|
||||
index_prefix=opts.opensearch_index_prefix,
|
||||
index_prefix=opts.opensearch_index_prefix
|
||||
or get_index_prefix(report),
|
||||
monthly_indexes=opts.opensearch_monthly_indexes,
|
||||
number_of_shards=shards,
|
||||
number_of_replicas=replicas,
|
||||
@@ -349,8 +391,9 @@ def _main():
|
||||
|
||||
try:
|
||||
if opts.webhook_smtp_tls_url:
|
||||
indent_value = 2 if opts.prettify_json else None
|
||||
webhook_client.save_smtp_tls_report_to_webhook(
|
||||
json.dumps(report, ensure_ascii=False, indent=2)
|
||||
json.dumps(report, ensure_ascii=False, indent=indent_value)
|
||||
)
|
||||
except Exception as error_:
|
||||
logger.error("Webhook Error: {0}".format(error_.__str__()))
|
||||
@@ -395,7 +438,7 @@ def _main():
|
||||
arg_parser.add_argument(
|
||||
"-c",
|
||||
"--config-file",
|
||||
help="a path to a configuration file " "(--silent implied)",
|
||||
help="a path to a configuration file (--silent implied)",
|
||||
)
|
||||
arg_parser.add_argument(
|
||||
"file_path",
|
||||
@@ -403,7 +446,7 @@ def _main():
|
||||
help="one or more paths to aggregate or forensic "
|
||||
"report files, emails, or mbox files'",
|
||||
)
|
||||
strip_attachment_help = "remove attachment payloads from forensic " "report output"
|
||||
strip_attachment_help = "remove attachment payloads from forensic report output"
|
||||
arg_parser.add_argument(
|
||||
"--strip-attachment-payloads", help=strip_attachment_help, action="store_true"
|
||||
)
|
||||
@@ -446,14 +489,14 @@ def _main():
|
||||
arg_parser.add_argument(
|
||||
"-t",
|
||||
"--dns_timeout",
|
||||
help="number of seconds to wait for an answer " "from DNS (default: 2.0)",
|
||||
help="number of seconds to wait for an answer from DNS (default: 2.0)",
|
||||
type=float,
|
||||
default=2.0,
|
||||
)
|
||||
arg_parser.add_argument(
|
||||
"--offline",
|
||||
action="store_true",
|
||||
help="do not make online queries for geolocation " " or DNS",
|
||||
help="do not make online queries for geolocation or DNS",
|
||||
)
|
||||
arg_parser.add_argument(
|
||||
"-s", "--silent", action="store_true", help="only print errors"
|
||||
@@ -471,6 +514,12 @@ def _main():
|
||||
"--debug", action="store_true", help="print debugging information"
|
||||
)
|
||||
arg_parser.add_argument("--log-file", default=None, help="output logging to a file")
|
||||
arg_parser.add_argument(
|
||||
"--no-prettify-json",
|
||||
action="store_false",
|
||||
dest="prettify_json",
|
||||
help="output JSON in a single line without indentation",
|
||||
)
|
||||
arg_parser.add_argument("-v", "--version", action="version", version=__version__)
|
||||
|
||||
aggregate_reports = []
|
||||
@@ -500,6 +549,7 @@ def _main():
|
||||
dns_timeout=args.dns_timeout,
|
||||
debug=args.debug,
|
||||
verbose=args.verbose,
|
||||
prettify_json=args.prettify_json,
|
||||
save_aggregate=False,
|
||||
save_forensic=False,
|
||||
save_smtp_tls=False,
|
||||
@@ -527,6 +577,7 @@ def _main():
|
||||
graph_tenant_id=None,
|
||||
graph_mailbox=None,
|
||||
graph_allow_unencrypted_storage=False,
|
||||
graph_url="https://graph.microsoft.com",
|
||||
hec=None,
|
||||
hec_token=None,
|
||||
hec_index=None,
|
||||
@@ -610,6 +661,7 @@ def _main():
|
||||
webhook_forensic_url=None,
|
||||
webhook_smtp_tls_url=None,
|
||||
webhook_timeout=60,
|
||||
normalize_timespan_threshold_hours=24.0,
|
||||
)
|
||||
args = arg_parser.parse_args()
|
||||
|
||||
@@ -620,9 +672,19 @@ def _main():
|
||||
exit(-1)
|
||||
opts.silent = True
|
||||
config = ConfigParser()
|
||||
index_prefix_domain_map = None
|
||||
config.read(args.config_file)
|
||||
if "general" in config.sections():
|
||||
general_config = config["general"]
|
||||
if "silent" in general_config:
|
||||
opts.silent = general_config.getboolean("silent")
|
||||
if "normalize_timespan_threshold_hours" in general_config:
|
||||
opts.normalize_timespan_threshold_hours = general_config.getfloat(
|
||||
"normalize_timespan_threshold_hours"
|
||||
)
|
||||
if "index_prefix_domain_map" in general_config:
|
||||
with open(general_config["index_prefix_domain_map"]) as f:
|
||||
index_prefix_domain_map = yaml.safe_load(f)
|
||||
if "offline" in general_config:
|
||||
opts.offline = general_config.getboolean("offline")
|
||||
if "strip_attachment_payloads" in general_config:
|
||||
@@ -696,6 +758,8 @@ def _main():
|
||||
opts.reverse_dns_map_path = general_config["reverse_dns_path"]
|
||||
if "reverse_dns_map_url" in general_config:
|
||||
opts.reverse_dns_map_url = general_config["reverse_dns_url"]
|
||||
if "prettify_json" in general_config:
|
||||
opts.prettify_json = general_config.getboolean("prettify_json")
|
||||
|
||||
if "mailbox" in config.sections():
|
||||
mailbox_config = config["mailbox"]
|
||||
@@ -729,7 +793,7 @@ def _main():
|
||||
if "host" in imap_config:
|
||||
opts.imap_host = imap_config["host"]
|
||||
else:
|
||||
logger.error("host setting missing from the " "imap config section")
|
||||
logger.error("host setting missing from the imap config section")
|
||||
exit(-1)
|
||||
if "port" in imap_config:
|
||||
opts.imap_port = imap_config.getint("port")
|
||||
@@ -745,14 +809,12 @@ def _main():
|
||||
if "user" in imap_config:
|
||||
opts.imap_user = imap_config["user"]
|
||||
else:
|
||||
logger.critical("user setting missing from the " "imap config section")
|
||||
logger.critical("user setting missing from the imap config section")
|
||||
exit(-1)
|
||||
if "password" in imap_config:
|
||||
opts.imap_password = imap_config["password"]
|
||||
else:
|
||||
logger.critical(
|
||||
"password setting missing from the " "imap config section"
|
||||
)
|
||||
logger.critical("password setting missing from the imap config section")
|
||||
exit(-1)
|
||||
if "reports_folder" in imap_config:
|
||||
opts.mailbox_reports_folder = imap_config["reports_folder"]
|
||||
@@ -821,21 +883,20 @@ def _main():
|
||||
opts.graph_user = graph_config["user"]
|
||||
else:
|
||||
logger.critical(
|
||||
"user setting missing from the " "msgraph config section"
|
||||
"user setting missing from the msgraph config section"
|
||||
)
|
||||
exit(-1)
|
||||
if "password" in graph_config:
|
||||
opts.graph_password = graph_config["password"]
|
||||
else:
|
||||
logger.critical(
|
||||
"password setting missing from the " "msgraph config section"
|
||||
"password setting missing from the msgraph config section"
|
||||
)
|
||||
if "client_secret" in graph_config:
|
||||
opts.graph_client_secret = graph_config["client_secret"]
|
||||
else:
|
||||
logger.critical(
|
||||
"client_secret setting missing from the "
|
||||
"msgraph config section"
|
||||
"client_secret setting missing from the msgraph config section"
|
||||
)
|
||||
exit(-1)
|
||||
|
||||
@@ -848,7 +909,7 @@ def _main():
|
||||
opts.graph_tenant_id = graph_config["tenant_id"]
|
||||
else:
|
||||
logger.critical(
|
||||
"tenant_id setting missing from the " "msgraph config section"
|
||||
"tenant_id setting missing from the msgraph config section"
|
||||
)
|
||||
exit(-1)
|
||||
|
||||
@@ -857,8 +918,7 @@ def _main():
|
||||
opts.graph_client_secret = graph_config["client_secret"]
|
||||
else:
|
||||
logger.critical(
|
||||
"client_secret setting missing from the "
|
||||
"msgraph config section"
|
||||
"client_secret setting missing from the msgraph config section"
|
||||
)
|
||||
exit(-1)
|
||||
|
||||
@@ -866,7 +926,7 @@ def _main():
|
||||
opts.graph_client_id = graph_config["client_id"]
|
||||
else:
|
||||
logger.critical(
|
||||
"client_id setting missing from the " "msgraph config section"
|
||||
"client_id setting missing from the msgraph config section"
|
||||
)
|
||||
exit(-1)
|
||||
|
||||
@@ -874,10 +934,13 @@ def _main():
|
||||
opts.graph_mailbox = graph_config["mailbox"]
|
||||
elif opts.graph_auth_method != AuthMethod.UsernamePassword.name:
|
||||
logger.critical(
|
||||
"mailbox setting missing from the " "msgraph config section"
|
||||
"mailbox setting missing from the msgraph config section"
|
||||
)
|
||||
exit(-1)
|
||||
|
||||
if "graph_url" in graph_config:
|
||||
opts.graph_url = graph_config["graph_url"]
|
||||
|
||||
if "allow_unencrypted_storage" in graph_config:
|
||||
opts.graph_allow_unencrypted_storage = graph_config.getboolean(
|
||||
"allow_unencrypted_storage"
|
||||
@@ -889,7 +952,7 @@ def _main():
|
||||
opts.elasticsearch_hosts = _str_to_list(elasticsearch_config["hosts"])
|
||||
else:
|
||||
logger.critical(
|
||||
"hosts setting missing from the " "elasticsearch config section"
|
||||
"hosts setting missing from the elasticsearch config section"
|
||||
)
|
||||
exit(-1)
|
||||
if "timeout" in elasticsearch_config:
|
||||
@@ -927,7 +990,7 @@ def _main():
|
||||
opts.opensearch_hosts = _str_to_list(opensearch_config["hosts"])
|
||||
else:
|
||||
logger.critical(
|
||||
"hosts setting missing from the " "opensearch config section"
|
||||
"hosts setting missing from the opensearch config section"
|
||||
)
|
||||
exit(-1)
|
||||
if "timeout" in opensearch_config:
|
||||
@@ -963,21 +1026,21 @@ def _main():
|
||||
opts.hec = hec_config["url"]
|
||||
else:
|
||||
logger.critical(
|
||||
"url setting missing from the " "splunk_hec config section"
|
||||
"url setting missing from the splunk_hec config section"
|
||||
)
|
||||
exit(-1)
|
||||
if "token" in hec_config:
|
||||
opts.hec_token = hec_config["token"]
|
||||
else:
|
||||
logger.critical(
|
||||
"token setting missing from the " "splunk_hec config section"
|
||||
"token setting missing from the splunk_hec config section"
|
||||
)
|
||||
exit(-1)
|
||||
if "index" in hec_config:
|
||||
opts.hec_index = hec_config["index"]
|
||||
else:
|
||||
logger.critical(
|
||||
"index setting missing from the " "splunk_hec config section"
|
||||
"index setting missing from the splunk_hec config section"
|
||||
)
|
||||
exit(-1)
|
||||
if "skip_certificate_verification" in hec_config:
|
||||
@@ -990,9 +1053,7 @@ def _main():
|
||||
if "hosts" in kafka_config:
|
||||
opts.kafka_hosts = _str_to_list(kafka_config["hosts"])
|
||||
else:
|
||||
logger.critical(
|
||||
"hosts setting missing from the " "kafka config section"
|
||||
)
|
||||
logger.critical("hosts setting missing from the kafka config section")
|
||||
exit(-1)
|
||||
if "user" in kafka_config:
|
||||
opts.kafka_username = kafka_config["user"]
|
||||
@@ -1007,21 +1068,20 @@ def _main():
|
||||
opts.kafka_aggregate_topic = kafka_config["aggregate_topic"]
|
||||
else:
|
||||
logger.critical(
|
||||
"aggregate_topic setting missing from the " "kafka config section"
|
||||
"aggregate_topic setting missing from the kafka config section"
|
||||
)
|
||||
exit(-1)
|
||||
if "forensic_topic" in kafka_config:
|
||||
opts.kafka_forensic_topic = kafka_config["forensic_topic"]
|
||||
else:
|
||||
logger.critical(
|
||||
"forensic_topic setting missing from the " "kafka config section"
|
||||
"forensic_topic setting missing from the kafka config section"
|
||||
)
|
||||
if "smtp_tls_topic" in kafka_config:
|
||||
opts.kafka_smtp_tls_topic = kafka_config["smtp_tls_topic"]
|
||||
else:
|
||||
logger.critical(
|
||||
"forensic_topic setting missing from the "
|
||||
"splunk_hec config section"
|
||||
"forensic_topic setting missing from the splunk_hec config section"
|
||||
)
|
||||
|
||||
if "smtp" in config.sections():
|
||||
@@ -1029,7 +1089,7 @@ def _main():
|
||||
if "host" in smtp_config:
|
||||
opts.smtp_host = smtp_config["host"]
|
||||
else:
|
||||
logger.critical("host setting missing from the " "smtp config section")
|
||||
logger.critical("host setting missing from the smtp config section")
|
||||
exit(-1)
|
||||
if "port" in smtp_config:
|
||||
opts.smtp_port = smtp_config.getint("port")
|
||||
@@ -1041,23 +1101,21 @@ def _main():
|
||||
if "user" in smtp_config:
|
||||
opts.smtp_user = smtp_config["user"]
|
||||
else:
|
||||
logger.critical("user setting missing from the " "smtp config section")
|
||||
logger.critical("user setting missing from the smtp config section")
|
||||
exit(-1)
|
||||
if "password" in smtp_config:
|
||||
opts.smtp_password = smtp_config["password"]
|
||||
else:
|
||||
logger.critical(
|
||||
"password setting missing from the " "smtp config section"
|
||||
)
|
||||
logger.critical("password setting missing from the smtp config section")
|
||||
exit(-1)
|
||||
if "from" in smtp_config:
|
||||
opts.smtp_from = smtp_config["from"]
|
||||
else:
|
||||
logger.critical("from setting missing from the " "smtp config section")
|
||||
logger.critical("from setting missing from the smtp config section")
|
||||
if "to" in smtp_config:
|
||||
opts.smtp_to = _str_to_list(smtp_config["to"])
|
||||
else:
|
||||
logger.critical("to setting missing from the " "smtp config section")
|
||||
logger.critical("to setting missing from the smtp config section")
|
||||
if "subject" in smtp_config:
|
||||
opts.smtp_subject = smtp_config["subject"]
|
||||
if "attachment" in smtp_config:
|
||||
@@ -1070,7 +1128,7 @@ def _main():
|
||||
if "bucket" in s3_config:
|
||||
opts.s3_bucket = s3_config["bucket"]
|
||||
else:
|
||||
logger.critical("bucket setting missing from the " "s3 config section")
|
||||
logger.critical("bucket setting missing from the s3 config section")
|
||||
exit(-1)
|
||||
if "path" in s3_config:
|
||||
opts.s3_path = s3_config["path"]
|
||||
@@ -1095,9 +1153,7 @@ def _main():
|
||||
if "server" in syslog_config:
|
||||
opts.syslog_server = syslog_config["server"]
|
||||
else:
|
||||
logger.critical(
|
||||
"server setting missing from the " "syslog config section"
|
||||
)
|
||||
logger.critical("server setting missing from the syslog config section")
|
||||
exit(-1)
|
||||
if "port" in syslog_config:
|
||||
opts.syslog_port = syslog_config["port"]
|
||||
@@ -1148,17 +1204,17 @@ def _main():
|
||||
if "host" in gelf_config:
|
||||
opts.gelf_host = gelf_config["host"]
|
||||
else:
|
||||
logger.critical("host setting missing from the " "gelf config section")
|
||||
logger.critical("host setting missing from the gelf config section")
|
||||
exit(-1)
|
||||
if "port" in gelf_config:
|
||||
opts.gelf_port = gelf_config["port"]
|
||||
else:
|
||||
logger.critical("port setting missing from the " "gelf config section")
|
||||
logger.critical("port setting missing from the gelf config section")
|
||||
exit(-1)
|
||||
if "mode" in gelf_config:
|
||||
opts.gelf_mode = gelf_config["mode"]
|
||||
else:
|
||||
logger.critical("mode setting missing from the " "gelf config section")
|
||||
logger.critical("mode setting missing from the gelf config section")
|
||||
exit(-1)
|
||||
|
||||
if "webhook" in config.sections():
|
||||
@@ -1170,7 +1226,7 @@ def _main():
|
||||
if "smtp_tls_url" in webhook_config:
|
||||
opts.webhook_smtp_tls_url = webhook_config["smtp_tls_url"]
|
||||
if "timeout" in webhook_config:
|
||||
opts.webhook_timeout = webhook_config["timeout"]
|
||||
opts.webhook_timeout = webhook_config.getint("timeout")
|
||||
|
||||
logger.setLevel(logging.ERROR)
|
||||
|
||||
@@ -1184,8 +1240,7 @@ def _main():
|
||||
try:
|
||||
fh = logging.FileHandler(opts.log_file, "a")
|
||||
formatter = logging.Formatter(
|
||||
"%(asctime)s - "
|
||||
"%(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
|
||||
"%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
|
||||
)
|
||||
fh.setFormatter(formatter)
|
||||
logger.addHandler(fh)
|
||||
@@ -1293,7 +1348,7 @@ def _main():
|
||||
|
||||
if opts.hec:
|
||||
if opts.hec_token is None or opts.hec_index is None:
|
||||
logger.error("HEC token and HEC index are required when " "using HEC URL")
|
||||
logger.error("HEC token and HEC index are required when using HEC URL")
|
||||
exit(1)
|
||||
|
||||
verify = True
|
||||
@@ -1396,6 +1451,7 @@ def _main():
|
||||
opts.always_use_local_files,
|
||||
opts.reverse_dns_map_path,
|
||||
opts.reverse_dns_map_url,
|
||||
opts.normalize_timespan_threshold_hours,
|
||||
child_conn,
|
||||
),
|
||||
)
|
||||
@@ -1418,7 +1474,17 @@ def _main():
|
||||
logger.error("Failed to parse {0} - {1}".format(result[1], result[0]))
|
||||
else:
|
||||
if result[0]["report_type"] == "aggregate":
|
||||
aggregate_reports.append(result[0]["report"])
|
||||
report_org = result[0]["report"]["report_metadata"]["org_name"]
|
||||
report_id = result[0]["report"]["report_metadata"]["report_id"]
|
||||
report_key = f"{report_org}_{report_id}"
|
||||
if report_key not in SEEN_AGGREGATE_REPORT_IDS:
|
||||
SEEN_AGGREGATE_REPORT_IDS[report_key] = True
|
||||
aggregate_reports.append(result[0]["report"])
|
||||
else:
|
||||
logger.debug(
|
||||
"Skipping duplicate aggregate report "
|
||||
f"from {report_org} with ID: {report_id}"
|
||||
)
|
||||
elif result[0]["report_type"] == "forensic":
|
||||
forensic_reports.append(result[0]["report"])
|
||||
elif result[0]["report_type"] == "smtp_tls":
|
||||
@@ -1436,6 +1502,7 @@ def _main():
|
||||
reverse_dns_map_path=opts.reverse_dns_map_path,
|
||||
reverse_dns_map_url=opts.reverse_dns_map_url,
|
||||
offline=opts.offline,
|
||||
normalize_timespan_threshold_hours=opts.normalize_timespan_threshold_hours,
|
||||
)
|
||||
aggregate_reports += reports["aggregate_reports"]
|
||||
forensic_reports += reports["forensic_reports"]
|
||||
@@ -1446,7 +1513,7 @@ def _main():
|
||||
try:
|
||||
if opts.imap_user is None or opts.imap_password is None:
|
||||
logger.error(
|
||||
"IMAP user and password must be specified if" "host is specified"
|
||||
"IMAP user and password must be specified ifhost is specified"
|
||||
)
|
||||
|
||||
ssl = True
|
||||
@@ -1485,6 +1552,7 @@ def _main():
|
||||
password=opts.graph_password,
|
||||
token_file=opts.graph_token_file,
|
||||
allow_unencrypted_storage=opts.graph_allow_unencrypted_storage,
|
||||
graph_url=opts.graph_url,
|
||||
)
|
||||
|
||||
except Exception:
|
||||
@@ -1544,6 +1612,7 @@ def _main():
|
||||
test=opts.mailbox_test,
|
||||
strip_attachment_payloads=opts.strip_attachment_payloads,
|
||||
since=opts.mailbox_since,
|
||||
normalize_timespan_threshold_hours=opts.normalize_timespan_threshold_hours,
|
||||
)
|
||||
|
||||
aggregate_reports += reports["aggregate_reports"]
|
||||
@@ -1579,6 +1648,7 @@ def _main():
|
||||
username=opts.smtp_user,
|
||||
password=opts.smtp_password,
|
||||
subject=opts.smtp_subject,
|
||||
require_encryption=opts.smtp_ssl,
|
||||
)
|
||||
except Exception:
|
||||
logger.exception("Failed to email results")
|
||||
@@ -1605,6 +1675,7 @@ def _main():
|
||||
reverse_dns_map_path=opts.reverse_dns_map_path,
|
||||
reverse_dns_map_url=opts.reverse_dns_map_url,
|
||||
offline=opts.offline,
|
||||
normalize_timespan_threshold_hours=opts.normalize_timespan_threshold_hours,
|
||||
)
|
||||
except FileExistsError as error:
|
||||
logger.error("{0}".format(error.__str__()))
|
||||
|
||||
parsedmarc/constants.py (new file)
@@ -0,0 +1,2 @@
__version__ = "9.0.0"
USER_AGENT = f"parsedmarc/{__version__}"
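An illustrative sketch (not from the diff) of how a constant like USER_AGENT is typically attached to outgoing HTTP requests; the URL is only an example:

import requests

from parsedmarc.constants import USER_AGENT

response = requests.get(
    "https://example.com/reverse_dns_map.csv",
    headers={"User-Agent": USER_AGENT},
    timeout=30,
)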
@@ -67,6 +67,8 @@ class _AggregateReportDoc(Document):
|
||||
date_range = Date()
|
||||
date_begin = Date()
|
||||
date_end = Date()
|
||||
normalized_timespan = Boolean()
|
||||
original_timespan_seconds = Integer()
|
||||
errors = Text()
|
||||
published_policy = Object(_PublishedPolicy)
|
||||
source_ip_address = Ip()
|
||||
@@ -393,52 +395,7 @@ def save_aggregate_report_to_elasticsearch(
|
||||
org_name = metadata["org_name"]
|
||||
report_id = metadata["report_id"]
|
||||
domain = aggregate_report["policy_published"]["domain"]
|
||||
begin_date = human_timestamp_to_datetime(metadata["begin_date"], to_utc=True)
|
||||
end_date = human_timestamp_to_datetime(metadata["end_date"], to_utc=True)
|
||||
begin_date_human = begin_date.strftime("%Y-%m-%d %H:%M:%SZ")
|
||||
end_date_human = end_date.strftime("%Y-%m-%d %H:%M:%SZ")
|
||||
if monthly_indexes:
|
||||
index_date = begin_date.strftime("%Y-%m")
|
||||
else:
|
||||
index_date = begin_date.strftime("%Y-%m-%d")
|
||||
aggregate_report["begin_date"] = begin_date
|
||||
aggregate_report["end_date"] = end_date
|
||||
date_range = [aggregate_report["begin_date"], aggregate_report["end_date"]]
|
||||
|
||||
org_name_query = Q(dict(match_phrase=dict(org_name=org_name)))
|
||||
report_id_query = Q(dict(match_phrase=dict(report_id=report_id)))
|
||||
domain_query = Q(dict(match_phrase={"published_policy.domain": domain}))
|
||||
begin_date_query = Q(dict(match=dict(date_begin=begin_date)))
|
||||
end_date_query = Q(dict(match=dict(date_end=end_date)))
|
||||
|
||||
if index_suffix is not None:
|
||||
search_index = "dmarc_aggregate_{0}*".format(index_suffix)
|
||||
else:
|
||||
search_index = "dmarc_aggregate*"
|
||||
if index_prefix is not None:
|
||||
search_index = "{0}{1}".format(index_prefix, search_index)
|
||||
search = Search(index=search_index)
|
||||
query = org_name_query & report_id_query & domain_query
|
||||
query = query & begin_date_query & end_date_query
|
||||
search.query = query
|
||||
|
||||
try:
|
||||
existing = search.execute()
|
||||
except Exception as error_:
|
||||
raise ElasticsearchError(
|
||||
"Elasticsearch's search for existing report \
|
||||
error: {}".format(error_.__str__())
|
||||
)
|
||||
|
||||
if len(existing) > 0:
|
||||
raise AlreadySaved(
|
||||
"An aggregate report ID {0} from {1} about {2} "
|
||||
"with a date range of {3} UTC to {4} UTC already "
|
||||
"exists in "
|
||||
"Elasticsearch".format(
|
||||
report_id, org_name, domain, begin_date_human, end_date_human
|
||||
)
|
||||
)
|
||||
|
||||
published_policy = _PublishedPolicy(
|
||||
domain=aggregate_report["policy_published"]["domain"],
|
||||
adkim=aggregate_report["policy_published"]["adkim"],
|
||||
@@ -450,6 +407,52 @@ def save_aggregate_report_to_elasticsearch(
|
||||
)
|
||||
|
||||
for record in aggregate_report["records"]:
|
||||
begin_date = human_timestamp_to_datetime(record["interval_begin"], to_utc=True)
|
||||
end_date = human_timestamp_to_datetime(record["interval_end"], to_utc=True)
|
||||
begin_date_human = begin_date.strftime("%Y-%m-%d %H:%M:%SZ")
|
||||
end_date_human = end_date.strftime("%Y-%m-%d %H:%M:%SZ")
|
||||
if monthly_indexes:
|
||||
index_date = begin_date.strftime("%Y-%m")
|
||||
else:
|
||||
index_date = begin_date.strftime("%Y-%m-%d")
|
||||
aggregate_report["begin_date"] = begin_date
|
||||
aggregate_report["end_date"] = end_date
|
||||
date_range = [aggregate_report["begin_date"], aggregate_report["end_date"]]
|
||||
|
||||
org_name_query = Q(dict(match_phrase=dict(org_name=org_name)))
|
||||
report_id_query = Q(dict(match_phrase=dict(report_id=report_id)))
|
||||
domain_query = Q(dict(match_phrase={"published_policy.domain": domain}))
|
||||
begin_date_query = Q(dict(match=dict(date_begin=begin_date)))
|
||||
end_date_query = Q(dict(match=dict(date_end=end_date)))
|
||||
|
||||
if index_suffix is not None:
|
||||
search_index = "dmarc_aggregate_{0}*".format(index_suffix)
|
||||
else:
|
||||
search_index = "dmarc_aggregate*"
|
||||
if index_prefix is not None:
|
||||
search_index = "{0}{1}".format(index_prefix, search_index)
|
||||
search = Search(index=search_index)
|
||||
query = org_name_query & report_id_query & domain_query
|
||||
query = query & begin_date_query & end_date_query
|
||||
search.query = query
|
||||
|
||||
try:
|
||||
existing = search.execute()
|
||||
except Exception as error_:
|
||||
raise ElasticsearchError(
|
||||
"Elasticsearch's search for existing report \
|
||||
error: {}".format(error_.__str__())
|
||||
)
|
||||
|
||||
if len(existing) > 0:
|
||||
raise AlreadySaved(
|
||||
"An aggregate report ID {0} from {1} about {2} "
|
||||
"with a date range of {3} UTC to {4} UTC already "
|
||||
"exists in "
|
||||
"Elasticsearch".format(
|
||||
report_id, org_name, domain, begin_date_human, end_date_human
|
||||
)
|
||||
)
|
||||
agg_doc = _AggregateReportDoc(
|
||||
xml_schema=aggregate_report["xml_schema"],
|
||||
org_name=metadata["org_name"],
|
||||
@@ -459,6 +462,7 @@ def save_aggregate_report_to_elasticsearch(
|
||||
date_range=date_range,
|
||||
date_begin=aggregate_report["begin_date"],
|
||||
date_end=aggregate_report["end_date"],
|
||||
normalized_timespan=record["normalized_timespan"],
|
||||
errors=metadata["errors"],
|
||||
published_policy=published_policy,
|
||||
source_ip_address=record["source"]["ip_address"],
|
||||
@@ -552,8 +556,8 @@ def save_forensic_report_to_elasticsearch(
|
||||
for original_header in original_headers:
|
||||
headers[original_header.lower()] = original_headers[original_header]
|
||||
|
||||
arrival_date_human = forensic_report["arrival_date_utc"]
|
||||
arrival_date = human_timestamp_to_datetime(arrival_date_human)
|
||||
arrival_date = human_timestamp_to_datetime(forensic_report["arrival_date_utc"])
|
||||
arrival_date_epoch_milliseconds = int(arrival_date.timestamp() * 1000)
|
||||
|
||||
if index_suffix is not None:
|
||||
search_index = "dmarc_forensic_{0}*".format(index_suffix)
|
||||
@@ -562,20 +566,35 @@ def save_forensic_report_to_elasticsearch(
|
||||
if index_prefix is not None:
|
||||
search_index = "{0}{1}".format(index_prefix, search_index)
|
||||
search = Search(index=search_index)
|
||||
arrival_query = {"match": {"arrival_date": arrival_date}}
|
||||
q = Q(arrival_query)
|
||||
q = Q(dict(match=dict(arrival_date=arrival_date_epoch_milliseconds)))
|
||||
|
||||
from_ = None
|
||||
to_ = None
|
||||
subject = None
|
||||
if "from" in headers:
|
||||
from_ = headers["from"]
|
||||
from_query = {"match_phrase": {"sample.headers.from": from_}}
|
||||
q = q & Q(from_query)
|
||||
# We convert the FROM header from a string list to a flat string.
|
||||
headers["from"] = headers["from"][0]
|
||||
if headers["from"][0] == "":
|
||||
headers["from"] = headers["from"][1]
|
||||
else:
|
||||
headers["from"] = " <".join(headers["from"]) + ">"
|
||||
|
||||
from_ = dict()
|
||||
from_["sample.headers.from"] = headers["from"]
|
||||
from_query = Q(dict(match_phrase=from_))
|
||||
q = q & from_query
|
||||
if "to" in headers:
|
||||
to_ = headers["to"]
|
||||
to_query = {"match_phrase": {"sample.headers.to": to_}}
|
||||
q = q & Q(to_query)
|
||||
# We convert the TO header from a string list to a flat string.
|
||||
headers["to"] = headers["to"][0]
|
||||
if headers["to"][0] == "":
|
||||
headers["to"] = headers["to"][1]
|
||||
else:
|
||||
headers["to"] = " <".join(headers["to"]) + ">"
|
||||
|
||||
to_ = dict()
|
||||
to_["sample.headers.to"] = headers["to"]
|
||||
to_query = Q(dict(match_phrase=to_))
|
||||
q = q & to_query
|
||||
if "subject" in headers:
|
||||
subject = headers["subject"]
|
||||
subject_query = {"match_phrase": {"sample.headers.subject": subject}}
|
||||
@@ -589,7 +608,9 @@ def save_forensic_report_to_elasticsearch(
|
||||
"A forensic sample to {0} from {1} "
|
||||
"with a subject of {2} and arrival date of {3} "
|
||||
"already exists in "
|
||||
"Elasticsearch".format(to_, from_, subject, arrival_date_human)
|
||||
"Elasticsearch".format(
|
||||
to_, from_, subject, forensic_report["arrival_date_utc"]
|
||||
)
|
||||
)
|
||||
|
||||
parsed_sample = forensic_report["parsed_sample"]
|
||||
@@ -625,7 +646,7 @@ def save_forensic_report_to_elasticsearch(
|
||||
user_agent=forensic_report["user_agent"],
|
||||
version=forensic_report["version"],
|
||||
original_mail_from=forensic_report["original_mail_from"],
|
||||
arrival_date=arrival_date,
|
||||
arrival_date=arrival_date_epoch_milliseconds,
|
||||
domain=forensic_report["reported_domain"],
|
||||
original_envelope_id=forensic_report["original_envelope_id"],
|
||||
authentication_results=forensic_report["authentication_results"],
|
||||
|
||||
@@ -63,9 +63,7 @@ class GmailConnection(MailboxConnection):
|
||||
).execute()
|
||||
except HttpError as e:
|
||||
if e.status_code == 409:
|
||||
logger.debug(
|
||||
f"Folder {folder_name} already exists, " f"skipping creation"
|
||||
)
|
||||
logger.debug(f"Folder {folder_name} already exists, skipping creation")
|
||||
else:
|
||||
raise e
|
||||
|
||||
|
||||
@@ -89,6 +89,7 @@ class MSGraphConnection(MailboxConnection):
|
||||
self,
|
||||
auth_method: str,
|
||||
mailbox: str,
|
||||
graph_url: str,
|
||||
client_id: str,
|
||||
client_secret: str,
|
||||
username: str,
|
||||
@@ -108,7 +109,10 @@ class MSGraphConnection(MailboxConnection):
|
||||
token_path=token_path,
|
||||
allow_unencrypted_storage=allow_unencrypted_storage,
|
||||
)
|
||||
client_params = {"credential": credential}
|
||||
client_params = {
|
||||
"credential": credential,
|
||||
"cloud": graph_url,
|
||||
}
|
||||
if not isinstance(credential, ClientSecretCredential):
|
||||
scopes = ["Mail.ReadWrite"]
|
||||
# Detect if mailbox is shared
|
||||
@@ -137,16 +141,16 @@ class MSGraphConnection(MailboxConnection):
|
||||
request_url = f"/users/{self.mailbox_name}/mailFolders{sub_url}"
|
||||
resp = self._client.post(request_url, json=request_body)
|
||||
if resp.status_code == 409:
|
||||
logger.debug(f"Folder {folder_name} already exists, " f"skipping creation")
|
||||
logger.debug(f"Folder {folder_name} already exists, skipping creation")
|
||||
elif resp.status_code == 201:
|
||||
logger.debug(f"Created folder {folder_name}")
|
||||
else:
|
||||
logger.warning(f"Unknown response " f"{resp.status_code} {resp.json()}")
|
||||
logger.warning(f"Unknown response {resp.status_code} {resp.json()}")
|
||||
|
||||
def fetch_messages(self, folder_name: str, **kwargs) -> List[str]:
|
||||
"""Returns a list of message UIDs in the specified folder"""
|
||||
folder_id = self._find_folder_id_from_folder_path(folder_name)
|
||||
url = f"/users/{self.mailbox_name}/mailFolders/" f"{folder_id}/messages"
|
||||
url = f"/users/{self.mailbox_name}/mailFolders/{folder_id}/messages"
|
||||
since = kwargs.get("since")
|
||||
if not since:
|
||||
since = None
|
||||
@@ -185,7 +189,7 @@ class MSGraphConnection(MailboxConnection):
|
||||
resp = self._client.patch(url, json={"isRead": "true"})
|
||||
if resp.status_code != 200:
|
||||
raise RuntimeWarning(
|
||||
f"Failed to mark message read" f"{resp.status_code}: {resp.json()}"
|
||||
f"Failed to mark message read{resp.status_code}: {resp.json()}"
|
||||
)
|
||||
|
||||
def fetch_message(self, message_id: str, **kwargs):
|
||||
@@ -193,7 +197,7 @@ class MSGraphConnection(MailboxConnection):
|
||||
result = self._client.get(url)
|
||||
if result.status_code != 200:
|
||||
raise RuntimeWarning(
|
||||
f"Failed to fetch message" f"{result.status_code}: {result.json()}"
|
||||
f"Failed to fetch message{result.status_code}: {result.json()}"
|
||||
)
|
||||
mark_read = kwargs.get("mark_read")
|
||||
if mark_read:
|
||||
@@ -205,7 +209,7 @@ class MSGraphConnection(MailboxConnection):
|
||||
resp = self._client.delete(url)
|
||||
if resp.status_code != 204:
|
||||
raise RuntimeWarning(
|
||||
f"Failed to delete message " f"{resp.status_code}: {resp.json()}"
|
||||
f"Failed to delete message {resp.status_code}: {resp.json()}"
|
||||
)
|
||||
|
||||
def move_message(self, message_id: str, folder_name: str):
|
||||
@@ -215,7 +219,7 @@ class MSGraphConnection(MailboxConnection):
|
||||
resp = self._client.post(url, json=request_body)
|
||||
if resp.status_code != 201:
|
||||
raise RuntimeWarning(
|
||||
f"Failed to move message " f"{resp.status_code}: {resp.json()}"
|
||||
f"Failed to move message {resp.status_code}: {resp.json()}"
|
||||
)
|
||||
|
||||
def keepalive(self):
|
||||
@@ -250,7 +254,7 @@ class MSGraphConnection(MailboxConnection):
|
||||
filter = f"?$filter=displayName eq '{folder_name}'"
|
||||
folders_resp = self._client.get(url + filter)
|
||||
if folders_resp.status_code != 200:
|
||||
raise RuntimeWarning(f"Failed to list folders." f"{folders_resp.json()}")
|
||||
raise RuntimeWarning(f"Failed to list folders.{folders_resp.json()}")
|
||||
folders: list = folders_resp.json()["value"]
|
||||
matched_folders = [
|
||||
folder for folder in folders if folder["displayName"] == folder_name
|
||||
|
||||
@@ -85,7 +85,5 @@ class IMAPConnection(MailboxConnection):
|
||||
logger.warning("IMAP connection timeout. Reconnecting...")
|
||||
sleep(check_timeout)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"IMAP connection error. {0}. " "Reconnecting...".format(e)
|
||||
)
|
||||
logger.warning("IMAP connection error. {0}. Reconnecting...".format(e))
|
||||
sleep(check_timeout)
|
||||
|
||||
@@ -67,6 +67,8 @@ class _AggregateReportDoc(Document):
|
||||
date_range = Date()
|
||||
date_begin = Date()
|
||||
date_end = Date()
|
||||
normalized_timespan = Boolean()
|
||||
original_timespan_seconds = Integer()
|
||||
errors = Text()
|
||||
published_policy = Object(_PublishedPolicy)
|
||||
source_ip_address = Ip()
|
||||
@@ -202,13 +204,15 @@ class _SMTPTLSPolicyDoc(InnerDoc):
|
||||
receiving_ip,
|
||||
receiving_mx_helo,
|
||||
failed_session_count,
|
||||
sending_mta_ip=None,
|
||||
receiving_mx_hostname=None,
|
||||
additional_information_uri=None,
|
||||
failure_reason_code=None,
|
||||
):
|
||||
self.failure_details.append(
|
||||
_details = _SMTPTLSFailureDetailsDoc(
|
||||
result_type=result_type,
|
||||
ip_address=ip_address,
|
||||
sending_mta_ip=sending_mta_ip,
|
||||
receiving_mx_hostname=receiving_mx_hostname,
|
||||
receiving_mx_helo=receiving_mx_helo,
|
||||
receiving_ip=receiving_ip,
|
||||
@@ -216,9 +220,10 @@ class _SMTPTLSPolicyDoc(InnerDoc):
|
||||
additional_information=additional_information_uri,
|
||||
failure_reason_code=failure_reason_code,
|
||||
)
|
||||
self.failure_details.append(_details)
|
||||
|
||||
|
||||
class _SMTPTLSFailureReportDoc(Document):
|
||||
class _SMTPTLSReportDoc(Document):
|
||||
class Index:
|
||||
name = "smtp_tls"
|
||||
|
||||
@@ -390,52 +395,7 @@ def save_aggregate_report_to_opensearch(
|
||||
org_name = metadata["org_name"]
|
||||
report_id = metadata["report_id"]
|
||||
domain = aggregate_report["policy_published"]["domain"]
|
||||
begin_date = human_timestamp_to_datetime(metadata["begin_date"], to_utc=True)
|
||||
end_date = human_timestamp_to_datetime(metadata["end_date"], to_utc=True)
|
||||
begin_date_human = begin_date.strftime("%Y-%m-%d %H:%M:%SZ")
|
||||
end_date_human = end_date.strftime("%Y-%m-%d %H:%M:%SZ")
|
||||
if monthly_indexes:
|
||||
index_date = begin_date.strftime("%Y-%m")
|
||||
else:
|
||||
index_date = begin_date.strftime("%Y-%m-%d")
|
||||
aggregate_report["begin_date"] = begin_date
|
||||
aggregate_report["end_date"] = end_date
|
||||
date_range = [aggregate_report["begin_date"], aggregate_report["end_date"]]
|
||||
|
||||
org_name_query = Q(dict(match_phrase=dict(org_name=org_name)))
|
||||
report_id_query = Q(dict(match_phrase=dict(report_id=report_id)))
|
||||
domain_query = Q(dict(match_phrase={"published_policy.domain": domain}))
|
||||
begin_date_query = Q(dict(match=dict(date_begin=begin_date)))
|
||||
end_date_query = Q(dict(match=dict(date_end=end_date)))
|
||||
|
||||
if index_suffix is not None:
|
||||
search_index = "dmarc_aggregate_{0}*".format(index_suffix)
|
||||
else:
|
||||
search_index = "dmarc_aggregate*"
|
||||
if index_prefix is not None:
|
||||
search_index = "{0}{1}".format(index_prefix, search_index)
|
||||
search = Search(index=search_index)
|
||||
query = org_name_query & report_id_query & domain_query
|
||||
query = query & begin_date_query & end_date_query
|
||||
search.query = query
|
||||
|
||||
try:
|
||||
existing = search.execute()
|
||||
except Exception as error_:
|
||||
raise OpenSearchError(
|
||||
"OpenSearch's search for existing report \
|
||||
error: {}".format(error_.__str__())
|
||||
)
|
||||
|
||||
if len(existing) > 0:
|
||||
raise AlreadySaved(
|
||||
"An aggregate report ID {0} from {1} about {2} "
|
||||
"with a date range of {3} UTC to {4} UTC already "
|
||||
"exists in "
|
||||
"OpenSearch".format(
|
||||
report_id, org_name, domain, begin_date_human, end_date_human
|
||||
)
|
||||
)
|
||||
|
||||
published_policy = _PublishedPolicy(
|
||||
domain=aggregate_report["policy_published"]["domain"],
|
||||
adkim=aggregate_report["policy_published"]["adkim"],
|
||||
@@ -447,6 +407,52 @@ def save_aggregate_report_to_opensearch(
|
||||
)
|
||||
|
||||
for record in aggregate_report["records"]:
|
||||
begin_date = human_timestamp_to_datetime(record["interval_begin"], to_utc=True)
|
||||
end_date = human_timestamp_to_datetime(record["interval_end"], to_utc=True)
|
||||
begin_date_human = begin_date.strftime("%Y-%m-%d %H:%M:%SZ")
|
||||
end_date_human = end_date.strftime("%Y-%m-%d %H:%M:%SZ")
|
||||
if monthly_indexes:
|
||||
index_date = begin_date.strftime("%Y-%m")
|
||||
else:
|
||||
index_date = begin_date.strftime("%Y-%m-%d")
|
||||
aggregate_report["begin_date"] = begin_date
|
||||
aggregate_report["end_date"] = end_date
|
||||
date_range = [aggregate_report["begin_date"], aggregate_report["end_date"]]
|
||||
|
||||
org_name_query = Q(dict(match_phrase=dict(org_name=org_name)))
|
||||
report_id_query = Q(dict(match_phrase=dict(report_id=report_id)))
|
||||
domain_query = Q(dict(match_phrase={"published_policy.domain": domain}))
|
||||
begin_date_query = Q(dict(match=dict(date_begin=begin_date)))
|
||||
end_date_query = Q(dict(match=dict(date_end=end_date)))
|
||||
|
||||
if index_suffix is not None:
|
||||
search_index = "dmarc_aggregate_{0}*".format(index_suffix)
|
||||
else:
|
||||
search_index = "dmarc_aggregate*"
|
||||
if index_prefix is not None:
|
||||
search_index = "{0}{1}".format(index_prefix, search_index)
|
||||
search = Search(index=search_index)
|
||||
query = org_name_query & report_id_query & domain_query
|
||||
query = query & begin_date_query & end_date_query
|
||||
search.query = query
|
||||
|
||||
try:
|
||||
existing = search.execute()
|
||||
except Exception as error_:
|
||||
raise OpenSearchError(
|
||||
"OpenSearch's search for existing report \
|
||||
error: {}".format(error_.__str__())
|
||||
)
|
||||
|
||||
if len(existing) > 0:
|
||||
raise AlreadySaved(
|
||||
"An aggregate report ID {0} from {1} about {2} "
|
||||
"with a date range of {3} UTC to {4} UTC already "
|
||||
"exists in "
|
||||
"OpenSearch".format(
|
||||
report_id, org_name, domain, begin_date_human, end_date_human
|
||||
)
|
||||
)
|
||||
agg_doc = _AggregateReportDoc(
|
||||
xml_schema=aggregate_report["xml_schema"],
|
||||
org_name=metadata["org_name"],
|
||||
@@ -499,6 +505,7 @@ def save_aggregate_report_to_opensearch(
|
||||
index = "{0}_{1}".format(index, index_suffix)
|
||||
if index_prefix:
|
||||
index = "{0}{1}".format(index_prefix, index)
|
||||
|
||||
index = "{0}-{1}".format(index, index_date)
|
||||
index_settings = dict(
|
||||
number_of_shards=number_of_shards, number_of_replicas=number_of_replicas
|
||||
@@ -548,8 +555,8 @@ def save_forensic_report_to_opensearch(
|
||||
for original_header in original_headers:
|
||||
headers[original_header.lower()] = original_headers[original_header]
|
||||
|
||||
arrival_date_human = forensic_report["arrival_date_utc"]
|
||||
arrival_date = human_timestamp_to_datetime(arrival_date_human)
|
||||
arrival_date = human_timestamp_to_datetime(forensic_report["arrival_date_utc"])
|
||||
arrival_date_epoch_milliseconds = int(arrival_date.timestamp() * 1000)
|
||||
|
||||
if index_suffix is not None:
|
||||
search_index = "dmarc_forensic_{0}*".format(index_suffix)
|
||||
@@ -558,20 +565,35 @@ def save_forensic_report_to_opensearch(
|
||||
if index_prefix is not None:
|
||||
search_index = "{0}{1}".format(index_prefix, search_index)
|
||||
search = Search(index=search_index)
|
||||
arrival_query = {"match": {"arrival_date": arrival_date}}
|
||||
q = Q(arrival_query)
|
||||
q = Q(dict(match=dict(arrival_date=arrival_date_epoch_milliseconds)))
|
||||
|
||||
from_ = None
|
||||
to_ = None
|
||||
subject = None
|
||||
if "from" in headers:
|
||||
from_ = headers["from"]
|
||||
from_query = {"match_phrase": {"sample.headers.from": from_}}
|
||||
q = q & Q(from_query)
|
||||
# We convert the FROM header from a string list to a flat string.
|
||||
headers["from"] = headers["from"][0]
|
||||
if headers["from"][0] == "":
|
||||
headers["from"] = headers["from"][1]
|
||||
else:
|
||||
headers["from"] = " <".join(headers["from"]) + ">"
|
||||
|
||||
from_ = dict()
|
||||
from_["sample.headers.from"] = headers["from"]
|
||||
from_query = Q(dict(match_phrase=from_))
|
||||
q = q & from_query
|
||||
if "to" in headers:
|
||||
to_ = headers["to"]
|
||||
to_query = {"match_phrase": {"sample.headers.to": to_}}
|
||||
q = q & Q(to_query)
|
||||
# We convert the TO header from a string list to a flat string.
|
||||
headers["to"] = headers["to"][0]
|
||||
if headers["to"][0] == "":
|
||||
headers["to"] = headers["to"][1]
|
||||
else:
|
||||
headers["to"] = " <".join(headers["to"]) + ">"
|
||||
|
||||
to_ = dict()
|
||||
to_["sample.headers.to"] = headers["to"]
|
||||
to_query = Q(dict(match_phrase=to_))
|
||||
q = q & to_query
|
||||
if "subject" in headers:
|
||||
subject = headers["subject"]
|
||||
subject_query = {"match_phrase": {"sample.headers.subject": subject}}
|
||||
@@ -585,7 +607,9 @@ def save_forensic_report_to_opensearch(
|
||||
"A forensic sample to {0} from {1} "
|
||||
"with a subject of {2} and arrival date of {3} "
|
||||
"already exists in "
|
||||
"OpenSearch".format(to_, from_, subject, arrival_date_human)
|
||||
"OpenSearch".format(
|
||||
to_, from_, subject, forensic_report["arrival_date_utc"]
|
||||
)
|
||||
)
|
||||
|
||||
parsed_sample = forensic_report["parsed_sample"]
|
||||
@@ -621,7 +645,7 @@ def save_forensic_report_to_opensearch(
|
||||
user_agent=forensic_report["user_agent"],
|
||||
version=forensic_report["version"],
|
||||
original_mail_from=forensic_report["original_mail_from"],
|
||||
arrival_date=arrival_date,
|
||||
arrival_date=arrival_date_epoch_milliseconds,
|
||||
domain=forensic_report["reported_domain"],
|
||||
original_envelope_id=forensic_report["original_envelope_id"],
|
||||
authentication_results=forensic_report["authentication_results"],
|
||||
@@ -685,7 +709,7 @@ def save_smtp_tls_report_to_opensearch(
|
||||
AlreadySaved
|
||||
"""
|
||||
logger.info("Saving aggregate report to OpenSearch")
|
||||
org_name = report["org_name"]
|
||||
org_name = report["organization_name"]
|
||||
report_id = report["report_id"]
|
||||
begin_date = human_timestamp_to_datetime(report["begin_date"], to_utc=True)
|
||||
end_date = human_timestamp_to_datetime(report["end_date"], to_utc=True)
|
||||
@@ -741,11 +765,11 @@ def save_smtp_tls_report_to_opensearch(
|
||||
number_of_shards=number_of_shards, number_of_replicas=number_of_replicas
|
||||
)
|
||||
|
||||
smtp_tls_doc = _SMTPTLSFailureReportDoc(
|
||||
organization_name=report["organization_name"],
|
||||
date_range=[report["date_begin"], report["date_end"]],
|
||||
date_begin=report["date_begin"],
|
||||
date_end=report["date_end"],
|
||||
smtp_tls_doc = _SMTPTLSReportDoc(
|
||||
org_name=report["organization_name"],
|
||||
date_range=[report["begin_date"], report["end_date"]],
|
||||
date_begin=report["begin_date"],
|
||||
date_end=report["end_date"],
|
||||
contact_info=report["contact_info"],
|
||||
report_id=report["report_id"],
|
||||
)
|
||||
@@ -760,32 +784,48 @@ def save_smtp_tls_report_to_opensearch(
|
||||
policy_doc = _SMTPTLSPolicyDoc(
|
||||
policy_domain=policy["policy_domain"],
|
||||
policy_type=policy["policy_type"],
|
||||
succesful_session_count=policy["successful_session_count"],
|
||||
failed_session_count=policy["failed_session_count"],
|
||||
policy_string=policy_strings,
|
||||
mx_host_patterns=mx_host_patterns,
|
||||
)
|
||||
if "failure_details" in policy:
|
||||
failure_details = policy["failure_details"]
|
||||
receiving_mx_hostname = None
|
||||
additional_information_uri = None
|
||||
failure_reason_code = None
|
||||
if "receiving_mx_hostname" in failure_details:
|
||||
receiving_mx_hostname = failure_details["receiving_mx_hostname"]
|
||||
if "additional_information_uri" in failure_details:
|
||||
additional_information_uri = failure_details[
|
||||
"additional_information_uri"
|
||||
]
|
||||
if "failure_reason_code" in failure_details:
|
||||
failure_reason_code = failure_details["failure_reason_code"]
|
||||
policy_doc.add_failure_details(
|
||||
result_type=failure_details["result_type"],
|
||||
ip_address=failure_details["ip_address"],
|
||||
receiving_ip=failure_details["receiving_ip"],
|
||||
receiving_mx_helo=failure_details["receiving_mx_helo"],
|
||||
failed_session_count=failure_details["failed_session_count"],
|
||||
receiving_mx_hostname=receiving_mx_hostname,
|
||||
additional_information_uri=additional_information_uri,
|
||||
failure_reason_code=failure_reason_code,
|
||||
)
|
||||
for failure_detail in policy["failure_details"]:
|
||||
receiving_mx_hostname = None
|
||||
additional_information_uri = None
|
||||
failure_reason_code = None
|
||||
ip_address = None
|
||||
receiving_ip = None
|
||||
receiving_mx_helo = None
|
||||
sending_mta_ip = None
|
||||
|
||||
if "receiving_mx_hostname" in failure_detail:
|
||||
receiving_mx_hostname = failure_detail["receiving_mx_hostname"]
|
||||
if "additional_information_uri" in failure_detail:
|
||||
additional_information_uri = failure_detail[
|
||||
"additional_information_uri"
|
||||
]
|
||||
if "failure_reason_code" in failure_detail:
|
||||
failure_reason_code = failure_detail["failure_reason_code"]
|
||||
if "ip_address" in failure_detail:
|
||||
ip_address = failure_detail["ip_address"]
|
||||
if "receiving_ip" in failure_detail:
|
||||
receiving_ip = failure_detail["receiving_ip"]
|
||||
if "receiving_mx_helo" in failure_detail:
|
||||
receiving_mx_helo = failure_detail["receiving_mx_helo"]
|
||||
if "sending_mta_ip" in failure_detail:
|
||||
sending_mta_ip = failure_detail["sending_mta_ip"]
|
||||
policy_doc.add_failure_details(
|
||||
result_type=failure_detail["result_type"],
|
||||
ip_address=ip_address,
|
||||
receiving_ip=receiving_ip,
|
||||
receiving_mx_helo=receiving_mx_helo,
|
||||
failed_session_count=failure_detail["failed_session_count"],
|
||||
sending_mta_ip=sending_mta_ip,
|
||||
receiving_mx_hostname=receiving_mx_hostname,
|
||||
additional_information_uri=additional_information_uri,
|
||||
failure_reason_code=failure_reason_code,
|
||||
)
|
||||
smtp_tls_doc.policies.append(policy_doc)
|
||||
|
||||
create_indexes([index], index_settings)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# About
|
||||
|
||||
`dbip-country-lite.mmdb` is provided by [dbip][dbip] under a
|
||||
[ Creative Commons Attribution 4.0 International License][cc].
|
||||
[Creative Commons Attribution 4.0 International License][cc].
|
||||
|
||||
[dbip]: https://db-ip.com/db/lite.php
|
||||
[dbip]: https://db-ip.com/db/download/ip-to-country-lite
|
||||
[cc]: http://creativecommons.org/licenses/by/4.0/
|
||||
|
||||
Binary file not shown.
@@ -3,6 +3,8 @@
|
||||
A mapping is meant to make it easier to identify who or what a sending source is. Please consider contributing
|
||||
additional mappings in a GitHub Pull Request.
|
||||
|
||||
Do not open these CSV files in Excel. It will replace Unicode characters with question marks. Use LibreOffice Calc instead.
|
||||
|
||||
## base_reverse_dns_map.csv
|
||||
|
||||
A CSV file with three fields: `base_reverse_dns`, `name`, and `type`.
|
||||
@@ -19,33 +21,72 @@ The `service_type` is based on the following rule precedence:
|
||||
3. All telecommunications providers that offer internet access are identified as `ISP`, even if they also offer other services, such as web hosting or email hosting.
|
||||
4. All web hosting providers are identified as `Web Hosting`, even if the service also offers email hosting.
|
||||
5. All email account providers are identified as `Email Provider`, no matter how or where they are hosted.
|
||||
6. All legitimate platforms offering their Software as a Service SaaS) are identified as `SaaS`, regardless of industry. This helps simplify metrics.
|
||||
6. All legitimate platforms offering their Software as a Service (SaaS) are identified as `SaaS`, regardless of industry. This helps simplify metrics.
|
||||
7. All other senders that use their own domain as a Reverse DNS base domain should be identified based on their industry:
|
||||
|
||||
- Agriculture
|
||||
- Automotive
|
||||
- Beauty
|
||||
- Conglomerate
|
||||
- Construction
|
||||
- Consulting
|
||||
- Defense
|
||||
- Education
|
||||
- Email Provider
|
||||
- Email Security
|
||||
- Education
|
||||
- Entertainment
|
||||
- Event Planning
|
||||
- Finance
|
||||
- Food
|
||||
- Government
|
||||
- Government Media
|
||||
- Healthcare
|
||||
- IaaS
|
||||
- Industrial
|
||||
- ISP
|
||||
- Legal
|
||||
- Logistics
|
||||
- Manufacturing
|
||||
- Marketing
|
||||
- MSP
|
||||
- MSSP
|
||||
- News
|
||||
- Nonprofit
|
||||
- PaaS
|
||||
- Photography
|
||||
- Physical Security
|
||||
- Print
|
||||
- Publishing
|
||||
- Real Estate
|
||||
- Retail
|
||||
- SaaS
|
||||
- Science
|
||||
- Search Engine
|
||||
- Social Media
|
||||
- Sports
|
||||
- Staffing
|
||||
- Technology
|
||||
- Travel
|
||||
- Web Host
|
||||
|
||||
The file currently contains over 600 mappings from a wide variety of email sending services, including large email
|
||||
providers, SaaS platforms, small web hosts, and healthcare companies. Ideally this mapping will continuously grow to
|
||||
include many other services and industries.
|
||||
The file currently contains over 1,400 mappings from a wide variety of email sending sources.
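For orientation, the sketch below shows one way to load and query the map with Python's standard `csv` module. It is not part of parsedmarc itself; it assumes the column names described above and uses a purely illustrative lookup key.

```python
import csv

# Load the map into a dict keyed by the base reverse DNS domain (lowercased,
# matching how parsedmarc compares these values elsewhere).
reverse_dns_map = {}
with open("base_reverse_dns_map.csv", newline="") as f:
    for row in csv.DictReader(f):
        reverse_dns_map[row["base_reverse_dns"].strip().lower()] = row

# Look up a sending source by its base reverse DNS domain.
entry = reverse_dns_map.get("example-sender.com")  # hypothetical key
if entry is not None:
    print(entry)  # the full row, including the source's name and type
else:
    print("Unknown sending source")
```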
|
||||
|
||||
## known_unknown_base_reverse_dns.txt
|
||||
|
||||
A list of reverse DNS base domains that could not be identified as belonging to a particular organization, service, or industry.
|
||||
|
||||
## base_reverse_dns.csv
|
||||
|
||||
A CSV with the fields `source_name` and optionally `message_count`. This CSV can be generated by exporting the base DNS data from the Kibana or Splunk dashboards provided by parsedmarc. This file is not tracked by Git.
|
||||
|
||||
## unknown_base_reverse_dns.csv
|
||||
|
||||
A CSV file with the fields `source_name` and `message_count`. This file is not tracked by Git.
|
||||
|
||||
## find_bad_utf8.py
|
||||
|
||||
Locates invalid UTF-8 bytes in files and optionally tries to correct them. Generated by GPT-5. Helped me find where I had introduced invalid bytes in `base_reverse_dns_map.csv`.
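The detection idea is straightforward: decode the raw bytes strictly and record the offset of each failure. A minimal standalone sketch of that idea (not the script itself, which appears in full later in this compare) follows:

```python
def find_bad_utf8_offsets(data: bytes) -> list[int]:
    """Return the absolute offsets of bytes that are not valid UTF-8."""
    offsets = []
    pos = 0
    while pos < len(data):
        try:
            data[pos:].decode("utf-8", errors="strict")
            break  # the rest of the data decodes cleanly
        except UnicodeDecodeError as e:
            offsets.append(pos + e.start)  # absolute offset of the bad byte
            pos = pos + e.start + 1        # skip past it and keep scanning
    return offsets

with open("base_reverse_dns_map.csv", "rb") as f:
    print(find_bad_utf8_offsets(f.read()))
```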
|
||||
|
||||
## find_unknown_base_reverse_dns.py
|
||||
|
||||
This Python script reads the domains in `base_reverse_dns.csv` and writes any that appear in neither `base_reverse_dns_map.csv` nor `known_unknown_base_reverse_dns.txt` to `unknown_base_reverse_dns.csv`. This is useful for identifying additional domains to contribute to `base_reverse_dns_map.csv` and `known_unknown_base_reverse_dns.txt`.
|
||||
|
||||
File diff suppressed because it is too large
parsedmarc/resources/maps/base_reverse_dns_types.txt (new file, 44 lines)
@@ -0,0 +1,44 @@
|
||||
Agriculture
|
||||
Automotive
|
||||
Beauty
|
||||
Conglomerate
|
||||
Construction
|
||||
Consulting
|
||||
Defense
|
||||
Education
|
||||
Email Provider
|
||||
Email Security
|
||||
Entertainment
|
||||
Event Planning
|
||||
Finance
|
||||
Food
|
||||
Government
|
||||
Government Media
|
||||
Healthcare
|
||||
ISP
|
||||
IaaS
|
||||
Industrial
|
||||
Legal
|
||||
Logistics
|
||||
MSP
|
||||
MSSP
|
||||
Manufacturing
|
||||
Marketing
|
||||
News
|
||||
Nonprofit
|
||||
PaaS
|
||||
Photography
|
||||
Physical Security
|
||||
Print
|
||||
Publishing
|
||||
Real Estate
|
||||
Retail
|
||||
SaaS
|
||||
Science
|
||||
Search Engine
|
||||
Social Media
|
||||
Sports
|
||||
Staffing
|
||||
Technology
|
||||
Travel
|
||||
Web Host
|
||||
parsedmarc/resources/maps/find_bad_utf8.py (new executable file, 488 lines)
@@ -0,0 +1,488 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
import argparse
|
||||
import codecs
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
from typing import List, Tuple
|
||||
|
||||
"""
|
||||
Locates and optionally corrects bad UTF-8 bytes in a file.
|
||||
Generated by GPT-5. Use at your own risk.
|
||||
"""
|
||||
|
||||
# -------------------------
|
||||
# UTF-8 scanning
|
||||
# -------------------------
|
||||
|
||||
|
||||
def scan_line_for_utf8_errors(
|
||||
line_bytes: bytes, line_no: int, base_offset: int, context: int
|
||||
):
|
||||
"""
|
||||
Scan one line of raw bytes for UTF-8 decoding errors.
|
||||
Returns a list of dicts describing each error.
|
||||
"""
|
||||
pos = 0
|
||||
results = []
|
||||
while pos < len(line_bytes):
|
||||
dec = codecs.getincrementaldecoder("utf-8")("strict")
|
||||
try:
|
||||
dec.decode(line_bytes[pos:], final=True)
|
||||
break
|
||||
except UnicodeDecodeError as e:
|
||||
rel_index = e.start
|
||||
abs_index_in_line = pos + rel_index
|
||||
abs_offset = base_offset + abs_index_in_line
|
||||
|
||||
start_ctx = max(0, abs_index_in_line - context)
|
||||
end_ctx = min(len(line_bytes), abs_index_in_line + 1 + context)
|
||||
ctx_bytes = line_bytes[start_ctx:end_ctx]
|
||||
bad_byte = line_bytes[abs_index_in_line : abs_index_in_line + 1]
|
||||
col = abs_index_in_line + 1 # 1-based byte column
|
||||
|
||||
results.append(
|
||||
{
|
||||
"line": line_no,
|
||||
"column": col,
|
||||
"abs_offset": abs_offset,
|
||||
"bad_byte_hex": bad_byte.hex(),
|
||||
"context_hex": ctx_bytes.hex(),
|
||||
"context_preview": ctx_bytes.decode("utf-8", errors="replace"),
|
||||
}
|
||||
)
|
||||
# Move past the offending byte and continue
|
||||
pos = abs_index_in_line + 1
|
||||
return results
|
||||
|
||||
|
||||
def scan_file_for_utf8_errors(path: str, context: int, limit: int):
|
||||
errors_found = 0
|
||||
limit_val = limit if limit != 0 else float("inf")
|
||||
|
||||
with open(path, "rb") as f:
|
||||
total_offset = 0
|
||||
line_no = 0
|
||||
while True:
|
||||
line = f.readline()
|
||||
if not line:
|
||||
break
|
||||
line_no += 1
|
||||
results = scan_line_for_utf8_errors(line, line_no, total_offset, context)
|
||||
for r in results:
|
||||
errors_found += 1
|
||||
print(
|
||||
f"[ERROR {errors_found}] Line {r['line']}, Column {r['column']}, "
|
||||
f"Absolute byte offset {r['abs_offset']}"
|
||||
)
|
||||
print(f" Bad byte: 0x{r['bad_byte_hex']}")
|
||||
print(f" Context (hex): {r['context_hex']}")
|
||||
print(f" Context (preview): {r['context_preview']}")
|
||||
print()
|
||||
if errors_found >= limit_val:
|
||||
print(f"Reached limit of {limit} errors. Stopping.")
|
||||
return errors_found
|
||||
total_offset += len(line)
|
||||
|
||||
if errors_found == 0:
|
||||
print("No invalid UTF-8 bytes found. 🎉")
|
||||
else:
|
||||
print(f"Found {errors_found} invalid UTF-8 byte(s).")
|
||||
return errors_found
|
||||
|
||||
|
||||
# -------------------------
|
||||
# Whole-file conversion
|
||||
# -------------------------
|
||||
|
||||
|
||||
def detect_encoding_text(path: str) -> Tuple[str, str]:
|
||||
"""
|
||||
Use charset-normalizer to detect file encoding.
|
||||
Return (encoding_name, decoded_text). Falls back to cp1252 if needed.
|
||||
"""
|
||||
try:
|
||||
from charset_normalizer import from_path
|
||||
except ImportError:
|
||||
print(
|
||||
"Please install charset-normalizer: pip install charset-normalizer",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(4)
|
||||
|
||||
matches = from_path(path)
|
||||
match = matches.best()
|
||||
if match is None or match.encoding is None:
|
||||
# Fallback heuristic for Western single-byte text
|
||||
with open(path, "rb") as fb:
|
||||
data = fb.read()
|
||||
try:
|
||||
return "cp1252", data.decode("cp1252", errors="strict")
|
||||
except UnicodeDecodeError:
|
||||
print("Unable to detect encoding reliably.", file=sys.stderr)
|
||||
sys.exit(5)
|
||||
|
||||
return match.encoding, str(match)
|
||||
|
||||
|
||||
def convert_to_utf8(src_path: str, out_path: str, src_encoding: str = None) -> str:
|
||||
"""
|
||||
Convert an entire file to UTF-8 (re-decoding everything).
|
||||
If src_encoding is provided, use it; else auto-detect.
|
||||
Returns the encoding actually used.
|
||||
"""
|
||||
if src_encoding:
|
||||
with open(src_path, "rb") as fb:
|
||||
data = fb.read()
|
||||
try:
|
||||
text = data.decode(src_encoding, errors="strict")
|
||||
except LookupError:
|
||||
print(f"Unknown encoding: {src_encoding}", file=sys.stderr)
|
||||
sys.exit(6)
|
||||
except UnicodeDecodeError as e:
|
||||
print(f"Decoding failed with {src_encoding}: {e}", file=sys.stderr)
|
||||
sys.exit(7)
|
||||
used = src_encoding
|
||||
else:
|
||||
used, text = detect_encoding_text(src_path)
|
||||
|
||||
with open(out_path, "w", encoding="utf-8", newline="") as fw:
|
||||
fw.write(text)
|
||||
return used
|
||||
|
||||
|
||||
def verify_utf8_file(path: str) -> Tuple[bool, str]:
|
||||
try:
|
||||
with open(path, "rb") as fb:
|
||||
fb.read().decode("utf-8", errors="strict")
|
||||
return True, ""
|
||||
except UnicodeDecodeError as e:
|
||||
return False, str(e)
|
||||
|
||||
|
||||
# -------------------------
|
||||
# Targeted single-byte fixer
|
||||
# -------------------------
|
||||
|
||||
|
||||
def iter_lines_with_offsets(b: bytes):
|
||||
"""
|
||||
Yield (line_bytes, line_start_abs_offset). Preserves LF/CRLF/CR in bytes.
|
||||
"""
|
||||
start = 0
|
||||
for i, byte in enumerate(b):
|
||||
if byte == 0x0A: # LF
|
||||
yield b[start : i + 1], start
|
||||
start = i + 1
|
||||
if start < len(b):
|
||||
yield b[start:], start
|
||||
|
||||
|
||||
def detect_probable_fallbacks() -> List[str]:
|
||||
# Good defaults for Western/Portuguese text
|
||||
return ["cp1252", "iso-8859-1", "iso-8859-15"]
|
||||
|
||||
|
||||
def repair_mixed_utf8_line(line: bytes, base_offset: int, fallback_chain: List[str]):
|
||||
"""
|
||||
Strictly validate UTF-8 and fix *only* the exact offending byte when an error occurs.
|
||||
This avoids touching adjacent valid UTF-8 (prevents mojibake like 'é').
|
||||
"""
|
||||
out_fragments: List[str] = []
|
||||
fixes = []
|
||||
pos = 0
|
||||
n = len(line)
|
||||
|
||||
while pos < n:
|
||||
dec = codecs.getincrementaldecoder("utf-8")("strict")
|
||||
try:
|
||||
s = dec.decode(line[pos:], final=True)
|
||||
out_fragments.append(s)
|
||||
break
|
||||
except UnicodeDecodeError as e:
|
||||
# Append the valid prefix before the error
|
||||
if e.start > 0:
|
||||
out_fragments.append(
|
||||
line[pos : pos + e.start].decode("utf-8", errors="strict")
|
||||
)
|
||||
|
||||
bad_index = pos + e.start # absolute index in 'line'
|
||||
bad_slice = line[bad_index : bad_index + 1] # FIX EXACTLY ONE BYTE
|
||||
|
||||
# Decode that single byte using the first working fallback
|
||||
decoded = None
|
||||
used_enc = None
|
||||
for enc in fallback_chain:
|
||||
try:
|
||||
decoded = bad_slice.decode(enc, errors="strict")
|
||||
used_enc = enc
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
if decoded is None:
|
||||
# latin-1 always succeeds (byte->same code point)
|
||||
decoded = bad_slice.decode("latin-1")
|
||||
used_enc = "latin-1 (fallback)"
|
||||
|
||||
out_fragments.append(decoded)
|
||||
|
||||
# Log the fix
|
||||
col_1based = bad_index + 1 # byte-based column
|
||||
fixes.append(
|
||||
{
|
||||
"line_base_offset": base_offset,
|
||||
"line": None, # caller fills line number
|
||||
"column": col_1based,
|
||||
"abs_offset": base_offset + bad_index,
|
||||
"bad_bytes_hex": bad_slice.hex(),
|
||||
"used_encoding": used_enc,
|
||||
"replacement_preview": decoded,
|
||||
}
|
||||
)
|
||||
|
||||
# Advance exactly one byte past the offending byte and continue
|
||||
pos = bad_index + 1
|
||||
|
||||
return "".join(out_fragments), fixes
|
||||
|
||||
|
||||
def targeted_fix_to_utf8(
|
||||
src_path: str,
|
||||
out_path: str,
|
||||
fallback_chain: List[str],
|
||||
dry_run: bool,
|
||||
max_fixes: int,
|
||||
):
|
||||
with open(src_path, "rb") as fb:
|
||||
data = fb.read()
|
||||
|
||||
total_fixes = 0
|
||||
repaired_lines: List[str] = []
|
||||
line_no = 0
|
||||
max_val = max_fixes if max_fixes != 0 else float("inf")
|
||||
|
||||
for line_bytes, base_offset in iter_lines_with_offsets(data):
|
||||
line_no += 1
|
||||
# Fast path: keep lines that are already valid UTF-8
|
||||
try:
|
||||
repaired_lines.append(line_bytes.decode("utf-8", errors="strict"))
|
||||
continue
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
|
||||
fixed_text, fixes = repair_mixed_utf8_line(
|
||||
line_bytes, base_offset, fallback_chain=fallback_chain
|
||||
)
|
||||
for f in fixes:
|
||||
f["line"] = line_no
|
||||
|
||||
repaired_lines.append(fixed_text)
|
||||
|
||||
# Log fixes
|
||||
for f in fixes:
|
||||
total_fixes += 1
|
||||
print(
|
||||
f"[FIX {total_fixes}] Line {f['line']}, Column {f['column']}, Abs offset {f['abs_offset']}"
|
||||
)
|
||||
print(f" Bad bytes: 0x{f['bad_bytes_hex']}")
|
||||
print(f" Used encoding: {f['used_encoding']}")
|
||||
preview = f["replacement_preview"].replace("\r", "\\r").replace("\n", "\\n")
|
||||
if len(preview) > 40:
|
||||
preview = preview[:40] + "…"
|
||||
print(f" Replacement preview: {preview}")
|
||||
print()
|
||||
if total_fixes >= max_val:
|
||||
print(f"Reached max fixes limit ({max_fixes}). Stopping scan.")
|
||||
break
|
||||
if total_fixes >= max_val:
|
||||
break
|
||||
|
||||
if dry_run:
|
||||
print(f"Dry run complete. Detected {total_fixes} fix(es). No file written.")
|
||||
return total_fixes
|
||||
|
||||
# Join and verify result can be encoded to UTF-8
|
||||
repaired_text = "".join(repaired_lines)
|
||||
try:
|
||||
repaired_text.encode("utf-8", errors="strict")
|
||||
except UnicodeEncodeError as e:
|
||||
print(f"Internal error: repaired text not valid UTF-8: {e}", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
|
||||
with open(out_path, "w", encoding="utf-8", newline="") as fw:
|
||||
fw.write(repaired_text)
|
||||
|
||||
print(f"Fixed file written to: {out_path}")
|
||||
print(f"Total fixes applied: {total_fixes}")
|
||||
return total_fixes
|
||||
|
||||
|
||||
# -------------------------
|
||||
# CLI
|
||||
# -------------------------
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser(
|
||||
description=(
|
||||
"Scan for invalid UTF-8; optionally convert whole file or fix only invalid bytes.\n\n"
|
||||
"By default, --convert and --fix **edit the input file in place** and create a backup "
|
||||
"named '<input>.bak' before writing. If you pass --output, the original file is left "
|
||||
"unchanged and no backup is created. Use --dry-run to preview fixes without writing."
|
||||
),
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
)
|
||||
ap.add_argument("path", help="Path to the CSV/text file")
|
||||
ap.add_argument(
|
||||
"--context",
|
||||
type=int,
|
||||
default=20,
|
||||
help="Bytes of context to show around errors (default: 20)",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--limit",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Max errors to report during scan (0 = unlimited)",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--skip-scan", action="store_true", help="Skip initial scan for speed"
|
||||
)
|
||||
|
||||
# Whole-file convert
|
||||
ap.add_argument(
|
||||
"--convert",
|
||||
action="store_true",
|
||||
help="Convert entire file to UTF-8 using auto/forced encoding "
|
||||
"(in-place by default; creates '<input>.bak').",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--encoding",
|
||||
help="Force source encoding for --convert or first fallback for --fix",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--output",
|
||||
help="Write to this path instead of in-place (no .bak is created in that case)",
|
||||
)
|
||||
|
||||
# Targeted fix
|
||||
ap.add_argument(
|
||||
"--fix",
|
||||
action="store_true",
|
||||
help="Fix only invalid byte(s) via fallback encodings "
|
||||
"(in-place by default; creates '<input>.bak').",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--fallbacks",
|
||||
help="Comma-separated fallback encodings (default: cp1252,iso-8859-1,iso-8859-15)",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="(fix) Print fixes but do not write or create a .bak",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--max-fixes",
|
||||
type=int,
|
||||
default=0,
|
||||
help="(fix) Stop after N fixes (0 = unlimited)",
|
||||
)
|
||||
|
||||
args = ap.parse_args()
|
||||
path = args.path
|
||||
|
||||
if not os.path.isfile(path):
|
||||
print(f"File not found: {path}", file=sys.stderr)
|
||||
sys.exit(2)
|
||||
|
||||
# Optional scan first
|
||||
if not args.skip_scan:
|
||||
scan_file_for_utf8_errors(path, context=args.context, limit=args.limit)
|
||||
|
||||
# Mode selection guards
|
||||
if args.convert and args.fix:
|
||||
print("Choose either --convert or --fix (not both).", file=sys.stderr)
|
||||
sys.exit(9)
|
||||
if not args.convert and not args.fix and args.skip_scan:
|
||||
print("No action selected (use --convert or --fix).")
|
||||
return
|
||||
if not args.convert and not args.fix:
|
||||
# User only wanted a scan
|
||||
return
|
||||
|
||||
# Determine output path and backup behavior
|
||||
# In-place by default: create '<input>.bak' before overwriting.
|
||||
if args.output:
|
||||
out_path = args.output
|
||||
in_place = False
|
||||
else:
|
||||
out_path = path
|
||||
in_place = True
|
||||
|
||||
# CONVERT mode
|
||||
if args.convert:
|
||||
print("\n[CONVERT MODE] Converting file to UTF-8...")
|
||||
if in_place:
|
||||
# Create backup before overwriting original
|
||||
backup_path = path + ".bak"
|
||||
shutil.copy2(path, backup_path)
|
||||
print(f"Backup created: {backup_path}")
|
||||
used = convert_to_utf8(path, out_path, src_encoding=args.encoding)
|
||||
print(f"Source encoding used: {used}")
|
||||
print(f"Saved UTF-8 file as: {out_path}")
|
||||
ok, err = verify_utf8_file(out_path)
|
||||
if ok:
|
||||
print("Verification: output is valid UTF-8 ✅")
|
||||
else:
|
||||
print(f"Verification failed: {err}")
|
||||
sys.exit(8)
|
||||
return
|
||||
|
||||
# FIX mode (targeted, single-byte)
|
||||
if args.fix:
|
||||
print("\n[FIX MODE] Fixing only invalid bytes to UTF-8...")
|
||||
if args.dry_run:
|
||||
# Dry-run: never write or create backup
|
||||
out_path_effective = os.devnull
|
||||
in_place_effective = False
|
||||
else:
|
||||
out_path_effective = out_path
|
||||
in_place_effective = in_place
|
||||
|
||||
# Build fallback chain (if --encoding provided, try it first)
|
||||
if args.fallbacks:
|
||||
fallback_chain = [e.strip() for e in args.fallbacks.split(",") if e.strip()]
|
||||
else:
|
||||
fallback_chain = detect_probable_fallbacks()
|
||||
if args.encoding and args.encoding not in fallback_chain:
|
||||
fallback_chain = [args.encoding] + fallback_chain
|
||||
|
||||
if in_place_effective:
|
||||
# Create backup before overwriting original (only when actually writing)
|
||||
backup_path = path + ".bak"
|
||||
shutil.copy2(path, backup_path)
|
||||
print(f"Backup created: {backup_path}")
|
||||
|
||||
fix_count = targeted_fix_to_utf8(
|
||||
path,
|
||||
out_path_effective,
|
||||
fallback_chain=fallback_chain,
|
||||
dry_run=args.dry_run,
|
||||
max_fixes=args.max_fixes,
|
||||
)
|
||||
|
||||
if not args.dry_run:
|
||||
ok, err = verify_utf8_file(out_path_effective)
|
||||
if ok:
|
||||
print("Verification: output is valid UTF-8 ✅")
|
||||
print(f"Fix mode completed — {fix_count} byte(s) corrected.")
|
||||
else:
|
||||
print(f"Verification failed: {err}")
|
||||
sys.exit(8)
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
parsedmarc/resources/maps/find_unknown_base_reverse_dns.py (new executable file, 80 lines)
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import os
|
||||
import csv
|
||||
|
||||
|
||||
def _main():
|
||||
input_csv_file_path = "base_reverse_dns.csv"
|
||||
base_reverse_dns_map_file_path = "base_reverse_dns_map.csv"
|
||||
known_unknown_list_file_path = "known_unknown_base_reverse_dns.txt"
|
||||
psl_overrides_file_path = "psl_overrides.txt"
|
||||
output_csv_file_path = "unknown_base_reverse_dns.csv"
|
||||
|
||||
csv_headers = ["source_name", "message_count"]
|
||||
|
||||
output_rows = []
|
||||
|
||||
known_unknown_domains = []
|
||||
psl_overrides = []
|
||||
known_domains = []
|
||||
output_rows = []
|
||||
|
||||
def load_list(file_path, list_var):
|
||||
if not os.path.exists(file_path):
|
||||
print(f"Error: {file_path} does not exist")
|
||||
print(f"Loading {file_path}")
|
||||
with open(file_path) as f:
|
||||
for line in f.readlines():
|
||||
domain = line.lower().strip()
|
||||
if domain in list_var:
|
||||
print(f"Error: {domain} is in {file_path} multiple times")
|
||||
exit(1)
|
||||
elif domain != "":
|
||||
list_var.append(domain)
|
||||
|
||||
load_list(known_unknown_list_file_path, known_unknown_domains)
|
||||
load_list(psl_overrides_file_path, psl_overrides)
|
||||
if not os.path.exists(base_reverse_dns_map_file_path):
|
||||
print(f"Error: {base_reverse_dns_map_file_path} does not exist")
|
||||
print(f"Loading {base_reverse_dns_map_file_path}")
|
||||
with open(base_reverse_dns_map_file_path) as f:
|
||||
for row in csv.DictReader(f):
|
||||
domain = row["base_reverse_dns"].lower().strip()
|
||||
if domain in known_domains:
|
||||
print(
|
||||
f"Error: {domain} is in {base_reverse_dns_map_file_path} multiple times"
|
||||
)
|
||||
exit()
|
||||
else:
|
||||
known_domains.append(domain)
|
||||
if domain in known_unknown_domains and known_domains:
|
||||
print(
|
||||
f"Error:{domain} is in {known_unknown_list_file_path} and \
|
||||
{base_reverse_dns_map_file_path}"
|
||||
)
|
||||
exit(1)
|
||||
if not os.path.exists(input_csv_file_path):
|
||||
print(f"Error: {base_reverse_dns_map_file_path} does not exist")
|
||||
exit(1)
|
||||
with open(input_csv_file_path) as f:
|
||||
for row in csv.DictReader(f):
|
||||
domain = row["source_name"].lower().strip()
|
||||
if domain == "":
|
||||
continue
|
||||
for psl_domain in psl_overrides:
|
||||
if domain.endswith(psl_domain):
|
||||
domain = psl_domain.strip(".").strip("-")
|
||||
break
|
||||
if domain not in known_domains and domain not in known_unknown_domains:
|
||||
print(f"New unknown domain found: {domain}")
|
||||
output_rows.append(row)
|
||||
print(f"Writing {output_csv_file_path}")
|
||||
with open(output_csv_file_path, "w") as f:
|
||||
writer = csv.DictWriter(f, fieldnames=csv_headers)
|
||||
writer.writeheader()
|
||||
writer.writerows(output_rows)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
_main()
|
||||
parsedmarc/resources/maps/known_unknown_base_reverse_dns.txt (new file, 601 lines)
@@ -0,0 +1,601 @@
|
||||
1jli.site
|
||||
26.107
|
||||
444qcuhilla.com
|
||||
4xr1.com
|
||||
9services.com
|
||||
a7e.ru
|
||||
a94434500-blog.com
|
||||
aams8.jp
|
||||
abv-10.top
|
||||
acemail.co.in
|
||||
activaicon.com
|
||||
adcritic.net
|
||||
adlucrumnewsletter.com
|
||||
admin.corpivensa.gob.ve
|
||||
advantageiq.com
|
||||
advrider.ro
|
||||
aerospacevitro.us.com
|
||||
agenturserver.de
|
||||
aghories.com
|
||||
ai270.net
|
||||
albagroup-eg.com
|
||||
alchemy.net
|
||||
alohabeachcamp.net
|
||||
alsiscad.com
|
||||
aluminumpipetubing.com
|
||||
americanstorageca.com
|
||||
amplusserver.info
|
||||
anchorfundhub.com
|
||||
anglishment.com
|
||||
anteldata.net.uy
|
||||
antis.edu
|
||||
antonaoll.com
|
||||
anviklass.org
|
||||
anwrgrp.lat
|
||||
aosau.net
|
||||
arandomserver.com
|
||||
aransk.ru
|
||||
ardcs.cn
|
||||
armninl.met
|
||||
as29550.net
|
||||
asahachimaru.com
|
||||
aserv.co.za
|
||||
asmecam.it
|
||||
ateky.net.br
|
||||
aurelienvos.com
|
||||
automatech.lat
|
||||
avistaadvantage.com
|
||||
b8sales.com
|
||||
bahjs.com
|
||||
baliaura.com
|
||||
banaras.co
|
||||
bearandbullmarketnews.com
|
||||
bestinvestingtime.com
|
||||
bhjui.com
|
||||
biocorp.com
|
||||
biosophy.net
|
||||
bitter-echo.com
|
||||
bizhostingservices.com
|
||||
blguss.com
|
||||
bluenet.ch
|
||||
bluhosting.com
|
||||
bnasg.com
|
||||
bodiax.pp.ua
|
||||
bost-law.com
|
||||
brainity.com
|
||||
brazalnde.net
|
||||
brellatransplc.shop
|
||||
brnonet.cz
|
||||
broadwaycover.com
|
||||
brushinglegal.de
|
||||
brw.net
|
||||
btes.tv
|
||||
budgeteasehub.com
|
||||
buoytoys.com
|
||||
buyjapanese.jp
|
||||
c53dw7m24rj.com
|
||||
cahtelrandom.org
|
||||
casadelmarsamara.com
|
||||
cashflowmasterypro.com
|
||||
cavabeen.com
|
||||
cbti.net
|
||||
centralmalaysia.com
|
||||
chauffeurplan.co.uk
|
||||
checkpox.fun
|
||||
chegouseuvlache.org
|
||||
chinaxingyu.xyz
|
||||
christus.mx
|
||||
churchills.market
|
||||
ci-xyz.fit
|
||||
cisumrecords.com
|
||||
ckaik.cn
|
||||
clcktoact.com
|
||||
cli-eurosignal.cz
|
||||
cloud-admin.it
|
||||
cloud-edm.com
|
||||
cloudflare-email.org
|
||||
cloudhosting.rs
|
||||
cloudlogin.co
|
||||
cloudplatformpro.com
|
||||
cnode.io
|
||||
cntcloud.com
|
||||
code-it.net
|
||||
codefriend.top
|
||||
colombiaceropapel.org
|
||||
commerceinsurance.com
|
||||
comsharempc.com
|
||||
conexiona.com
|
||||
coolblaze.com
|
||||
coowo.com
|
||||
corpemail.net
|
||||
cp2-myorderbox.com
|
||||
cps.com.ar
|
||||
crnagora.net
|
||||
cross-d-bar-troutranch.com
|
||||
ctla.co.kr
|
||||
cumbalikonakhotel.com
|
||||
currencyexconverter.com
|
||||
daakbabu.com
|
||||
daikinmae.com
|
||||
dairyvalley.com.my
|
||||
dastans.ru
|
||||
datahost36.de
|
||||
ddii.network
|
||||
deep-sek.shop
|
||||
deetownsounds.com
|
||||
descarca-counter-strike.net
|
||||
detrot.xyz
|
||||
dettlaffinc.com
|
||||
dextoolse.net
|
||||
digestivedaily.com
|
||||
digi.net.my
|
||||
dinofelis.cn
|
||||
diwkyncbi.top
|
||||
dkginternet.com
|
||||
dnexpress.info
|
||||
dns-oid.com
|
||||
dnsindia.net
|
||||
domainserver.ne.jp
|
||||
domconfig.com
|
||||
doorsrv.com
|
||||
dreampox.fun
|
||||
dreamtechmedia.com
|
||||
ds.network
|
||||
dss-group.net
|
||||
dvj.theworkpc.com
|
||||
dwlcka.com
|
||||
dynamic-wiretel.in
|
||||
dyntcorp.com
|
||||
easternkingspei.com
|
||||
economiceagles.com
|
||||
egosimail.com
|
||||
eliotporterphotos.us
|
||||
emailgids.net
|
||||
emailperegrine.com
|
||||
entendercopilot.com
|
||||
entretothom.net
|
||||
epaycontrol.com
|
||||
epicinvestmentsreview.co
|
||||
epicinvestmentsreview.com
|
||||
epik.com
|
||||
epsilon-group.com
|
||||
erestaff.com
|
||||
euro-trade-gmbh.com
|
||||
example.com
|
||||
exposervers.com-new
|
||||
extendcp.co.uk
|
||||
eyecandyhosting.xyz
|
||||
fastwebnet.it
|
||||
fd9ing7wfn.com
|
||||
feipnghardware.com
|
||||
fetscorp.shop
|
||||
fewo-usedom.net
|
||||
fin-crime.com
|
||||
financeaimpoint.com
|
||||
financeupward.com
|
||||
firmflat.com
|
||||
flex-video.bnr.la
|
||||
flourishfusionlife.com
|
||||
formicidaehunt.net
|
||||
fosterheap.com
|
||||
fredi.shop
|
||||
frontiernet.net
|
||||
ftifb7tk3c.com
|
||||
gamersprotectionvpn.online
|
||||
gendns.com
|
||||
getgreencardsfast.com
|
||||
getthatroi.com
|
||||
gibbshosting.com
|
||||
gigidea.net
|
||||
giize.com
|
||||
ginous.eu.com
|
||||
gis.net
|
||||
gist-th.com
|
||||
globalglennpartners.com
|
||||
goldsboroughplace.com
|
||||
gophermedia.com
|
||||
gqlists.us.com
|
||||
gratzl.de
|
||||
greatestworldnews.com
|
||||
greennutritioncare.com
|
||||
gsbb.com
|
||||
gumbolimbo.net
|
||||
h-serv.co.uk
|
||||
haedefpartners.com
|
||||
halcyon-aboveboard.com
|
||||
hanzubon.org
|
||||
healthfuljourneyjoy.com
|
||||
hgnbroken.us.com
|
||||
highwey-diesel.com
|
||||
hirofactory.com
|
||||
hjd.asso.fr
|
||||
hongchenggco.pro
|
||||
hongkongtaxi.co
|
||||
hopsinthehanger.com
|
||||
hosted-by-worldstream.net
|
||||
hostelsucre.com
|
||||
hosting1337.com
|
||||
hostinghane.com
|
||||
hostinglotus.cloud
|
||||
hostingmichigan.com
|
||||
hostiran.name
|
||||
hostmnl.com
|
||||
hostname.localhost
|
||||
hostnetwork.com
|
||||
hosts.net.nz
|
||||
hostserv.eu
|
||||
hostwhitelabel.com
|
||||
hpms1.jp
|
||||
hunariojmk.net
|
||||
hunriokinmuim.net
|
||||
hypericine.com
|
||||
i-mecca.net
|
||||
iaasdns.com
|
||||
iam.net.ma
|
||||
iconmarketingguy.com
|
||||
idcfcloud.net
|
||||
idealconcept.live
|
||||
igmohji.com
|
||||
igppevents.org.uk
|
||||
ihglobaldns.com
|
||||
ilmessicano.com
|
||||
imjtmn.cn
|
||||
immenzaces.com
|
||||
in-addr-arpa
|
||||
in-addr.arpa
|
||||
indsalelimited.com
|
||||
indulgent-holistic.com
|
||||
industechint.org
|
||||
inshaaegypt.com
|
||||
intal.uz
|
||||
interfarma.kz
|
||||
intocpanel.com
|
||||
ip-147-135-108.us
|
||||
ip-178-33-109.eu
|
||||
ip-ptr.tech
|
||||
iswhatpercent.com
|
||||
itsidc.com
|
||||
itwebs.com
|
||||
iuon.net
|
||||
ivol.co
|
||||
jalanet.co.id
|
||||
jimishare.com
|
||||
jlccptt.net.cn
|
||||
jlenterprises.co.uk
|
||||
jmontalto.com
|
||||
joyomokei.com
|
||||
jumanra.org
|
||||
justlongshirts.com
|
||||
kahlaa.com
|
||||
kaw.theworkpc.com
|
||||
kbronet.com.tw
|
||||
kdnursing.org
|
||||
kielnet.net
|
||||
kihy.theworkpc.com
|
||||
kingschurchwirral.org
|
||||
kitchenaildbd.com
|
||||
klaomi.shop
|
||||
knkconsult.net
|
||||
kohshikai.com
|
||||
krhfund.org
|
||||
krillaglass.com
|
||||
lancorhomes.com
|
||||
landpedia.org
|
||||
lanzatuseo.es
|
||||
layerdns.cloud
|
||||
learninglinked.com
|
||||
legenditds.com
|
||||
levertechcentre.com
|
||||
lhost.no
|
||||
lideri.net.br
|
||||
lighthouse-media.com
|
||||
lightpath.net
|
||||
limogesporcelainboxes.com
|
||||
lindsaywalt.net
|
||||
linuxsunucum.com
|
||||
listertermoformadoa.com
|
||||
llsend.com
|
||||
local.net
|
||||
lohkal.com
|
||||
londionrtim.net
|
||||
lonestarmm.net
|
||||
longmarquis.com
|
||||
longwoodmgmt.com
|
||||
lse.kz
|
||||
lunvoy.com
|
||||
luxarpro.ru
|
||||
lwl-puehringer.at
|
||||
lynx.net.lb
|
||||
lyse.net
|
||||
m-sender.com.ua
|
||||
maggiolicloud.it
|
||||
magnetmail.net
|
||||
magnumgo.uz
|
||||
maia11.com
|
||||
mail-fire.com
|
||||
mailsentinel.net
|
||||
mailset.cn
|
||||
malardino.net
|
||||
managed-vps.net
|
||||
manhattanbulletpoint.com
|
||||
manpowerservices.com
|
||||
marketmysterycode.com
|
||||
marketwizardspro.com
|
||||
masterclassjournal.com
|
||||
matroguel.cam
|
||||
maximpactipo.com
|
||||
mechanicalwalk.store
|
||||
mediavobis.com
|
||||
meqlobal.com
|
||||
mgts.by
|
||||
migrans.net
|
||||
miixta.com
|
||||
milleniumsrv.com
|
||||
mindworksunlimited.com
|
||||
mirth-gale.com
|
||||
misorpresa.com
|
||||
mitomobile.com
|
||||
mitsubachi-kibako.net
|
||||
mjinn.com
|
||||
mkegs.shop
|
||||
mobius.fr
|
||||
model-ac.ink
|
||||
moderntradingnews.com
|
||||
monnaiegroup.com
|
||||
monopolizeright.com
|
||||
moonjaws.com
|
||||
morningnewscatcher.com
|
||||
motion4ever.net
|
||||
mschosting.com
|
||||
msdp1.com
|
||||
mspnet.pro
|
||||
mts-nn.ru
|
||||
multifamilydesign.com
|
||||
mxserver.ro
|
||||
mxthunder.net
|
||||
my-ihor.ru
|
||||
mycloudmailbox.com
|
||||
myfriendforum.com
|
||||
myrewards.net
|
||||
mysagestore.com
|
||||
mysecurewebserver.com
|
||||
myshanet.net
|
||||
myvps.jp
|
||||
mywedsite.net
|
||||
mywic.eu
|
||||
name.tools
|
||||
nanshenqfurniture.com
|
||||
nask.pl
|
||||
navertise.net
|
||||
ncbb.kz
|
||||
ncport.ru
|
||||
ncsdi.ws
|
||||
nebdig.com
|
||||
neovet-base.ru
|
||||
netbri.com
|
||||
netcentertelecom.net.br
|
||||
neti.ee
|
||||
netkl.org
|
||||
newinvestingguide.com
|
||||
newwallstreetcode.com
|
||||
ngvcv.cn
|
||||
nic.name
|
||||
nidix.net
|
||||
nieuwedagnetwerk.net
|
||||
nlscanme.com
|
||||
nmeuh.cn
|
||||
noisndametal.com
|
||||
nucleusemail.com
|
||||
nutriboostlife.com
|
||||
nwo.giize.com
|
||||
nwwhalewatchers.org
|
||||
ny.adsl
|
||||
nyt1.com
|
||||
offerslatedeals.com
|
||||
office365.us
|
||||
ogicom.net
|
||||
olivettilexikon.co.uk
|
||||
omegabrasil.inf.br
|
||||
onnet21.com
|
||||
onumubunumu.com
|
||||
oppt-ac.fit
|
||||
orbitel.net.co
|
||||
orfsurface.com
|
||||
orientalspot.com
|
||||
outsidences.com
|
||||
ovaltinalization.co
|
||||
overta.ru
|
||||
ox28vgrurc.com
|
||||
pamulang.net
|
||||
panaltyspot.space
|
||||
panolacountysheriffms.com
|
||||
passionatesmiles.com
|
||||
paulinelam.com
|
||||
pdi-corp.com
|
||||
peloquinbeck.com
|
||||
perimetercenter.net
|
||||
permanentscreen.com
|
||||
permasteellisagroup.com
|
||||
perumkijhyu.net
|
||||
pesnia.com.ua
|
||||
ph8ltwdi12o.com
|
||||
pharmada.com.de
|
||||
phdns3.es
|
||||
pigelixval1.com
|
||||
pipefittingsindia.com
|
||||
planethoster.net
|
||||
playamedia.io
|
||||
plesk.page
|
||||
pmnhost.net
|
||||
pokiloandhu.net
|
||||
pokupki5.ru
|
||||
polandi.net
|
||||
popiup.com
|
||||
ports.net
|
||||
posolstvostilya.com
|
||||
potia.net
|
||||
prima.com.ar
|
||||
prima.net.ar
|
||||
profsol.co.uk
|
||||
prohealthmotion.com
|
||||
promooffermarket.site
|
||||
proudserver.com
|
||||
proxado.com
|
||||
psnm.ru
|
||||
pvcwindowsprices.live
|
||||
qontenciplc.autos
|
||||
quakeclick.com
|
||||
quasarstate.store
|
||||
quatthonggiotico.com
|
||||
qxyxab44njd.com
|
||||
radianthealthrenaissance.com
|
||||
rapidns.com
|
||||
raxa.host
|
||||
reberte.com
|
||||
reethvikintl.com
|
||||
regruhosting.ru
|
||||
reliablepanel.com
|
||||
rgb365.eu
|
||||
riddlecamera.net
|
||||
riddletrends.com
|
||||
roccopugliese.com
|
||||
runnin-rebels.com
|
||||
rupar.puglia.it
|
||||
rwdhosting.ca
|
||||
s500host.com
|
||||
sageevents.co.ke
|
||||
sahacker-2020.com
|
||||
samsales.site
|
||||
sante-lorraine.fr
|
||||
saransk.ru
|
||||
satirogluet.com
|
||||
scioncontacts.com
|
||||
sdcc.my
|
||||
seaspraymta3.net
|
||||
secorp.mx
|
||||
securen.net
|
||||
securerelay.in
|
||||
securev.net
|
||||
seductiveeyes.com
|
||||
seizethedayconsulting.com
|
||||
serroplast.shop
|
||||
server290.com
|
||||
server342.com
|
||||
server3559.cc
|
||||
servershost.biz
|
||||
sfek.kz
|
||||
sgnetway.net
|
||||
shopfox.ca
|
||||
silvestrejaguar.sbs
|
||||
silvestreonca.sbs
|
||||
simplediagnostics.org
|
||||
siriuscloud.jp
|
||||
sisglobalresearch.com
|
||||
sixpacklink.net
|
||||
sjestyle.com
|
||||
smallvillages.com
|
||||
smartape-vps.com
|
||||
solusoftware.com
|
||||
sourcedns.com
|
||||
southcoastwebhosting12.com
|
||||
specialtvvs.com
|
||||
spiritualtechnologies.io
|
||||
sprout.org
|
||||
srv.cat
|
||||
stableserver.net
|
||||
statlerfa.co.uk
|
||||
stock-smtp.top
|
||||
stockepictigers.com
|
||||
stockexchangejournal.com
|
||||
subterranean-concave.com
|
||||
suksangroup.com
|
||||
swissbluetopaz.com
|
||||
switer.shop
|
||||
sysop4.com
|
||||
system.eu.com
|
||||
szhongbing.com
|
||||
t-jon.com
|
||||
tacaindo.net
|
||||
tacom.tj
|
||||
tankertelz.co
|
||||
tataidc.com
|
||||
teamveiw.com
|
||||
tecnoxia.net
|
||||
tel-xyz.fit
|
||||
tenkids.net
|
||||
terminavalley.com
|
||||
thaicloudsolutions.com
|
||||
thaikinghost.com
|
||||
thaimonster.com
|
||||
thegermainetruth.net
|
||||
thehandmaderose.com
|
||||
thepushcase.com
|
||||
ticdns.com
|
||||
tigo.bo
|
||||
toledofibra.net.br
|
||||
topdns.com
|
||||
totaal.net
|
||||
totalplay.net
|
||||
tqh.ro
|
||||
traderlearningcenter.com
|
||||
tradeukraine.site
|
||||
traveleza.com
|
||||
trwww.com
|
||||
tsuzakij.com
|
||||
tullostrucking.com
|
||||
turbinetrends.com
|
||||
twincitiesdistinctivehomes.com
|
||||
tylerfordonline.com
|
||||
uiyum.com
|
||||
ultragate.com
|
||||
uneedacollie.com
|
||||
unified.services
|
||||
unite.services
|
||||
urawasl.com
|
||||
us.servername.us
|
||||
vagebond.net
|
||||
varvia.de
|
||||
vbcploo.com
|
||||
vdc.vn
|
||||
vendimetry.com
|
||||
vibrantwellnesscorp.com
|
||||
virtualine.org
|
||||
visit.docotor
|
||||
viviotech.us
|
||||
vlflgl.com
|
||||
volganet.ru
|
||||
vrns.net
|
||||
vulterdi.edu
|
||||
vvondertex.com
|
||||
wallstreetsgossip.com
|
||||
wamego.net
|
||||
wanekoohost.com
|
||||
wealthexpertisepro.com
|
||||
web-login.eu
|
||||
weblinkinternational.com
|
||||
webnox.io
|
||||
websale.net
|
||||
welllivinghive.com
|
||||
westparkcom.com
|
||||
wetransfer-eu.com
|
||||
wheelch.me
|
||||
whoflew.com
|
||||
whpservers.com
|
||||
wisdomhard.com
|
||||
wisewealthcircle.com
|
||||
wisvis.com
|
||||
wodeniowa.com
|
||||
wordpresshosting.xyz
|
||||
wsiph2.com
|
||||
xnt.mx
|
||||
xodiax.com
|
||||
xpnuf.cn
|
||||
xsfati.us.com
|
||||
xspmail.jp
|
||||
yourciviccompass.com
|
||||
yourinvestworkbook.com
|
||||
yoursitesecure.net
|
||||
zerowebhosting.net
|
||||
zmml.uk
|
||||
znlc.jp
|
||||
ztomy.com
|
||||
parsedmarc/resources/maps/psl_overrides.txt (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
-applefibernet.com
|
||||
-c3.net.pl
|
||||
-celsiainternet.com
|
||||
-clientes-izzi.mx
|
||||
-clientes-zap-izzi.mx
|
||||
-imnet.com.br
|
||||
-mcnbd.com
|
||||
-smile.com.bd
|
||||
-tataidc.co.in
|
||||
-veloxfiber.com.br
|
||||
-wconect.com.br
|
||||
.amazonaws.com
|
||||
.cloudaccess.net
|
||||
.ddnsgeek.com
|
||||
.fastvps-server.com
|
||||
.in-addr-arpa
|
||||
.in-addr.arpa
|
||||
.kasserver.com
|
||||
.kinghost.net
|
||||
.linode.com
|
||||
.linodeusercontent.com
|
||||
.na4u.ru
|
||||
.sakura.ne.jp
|
||||
parsedmarc/resources/maps/sortlists.py (new executable file, 184 lines)
@@ -0,0 +1,184 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import csv
|
||||
from pathlib import Path
|
||||
from typing import Mapping, Iterable, Optional, Collection, Union, List, Dict
|
||||
|
||||
|
||||
class CSVValidationError(Exception):
|
||||
def __init__(self, errors: list[str]):
|
||||
super().__init__("\n".join(errors))
|
||||
self.errors = errors
|
||||
|
||||
|
||||
def sort_csv(
|
||||
filepath: Union[str, Path],
|
||||
field: str,
|
||||
*,
|
||||
sort_field_value_must_be_unique: bool = True,
|
||||
strip_whitespace: bool = True,
|
||||
fields_to_lowercase: Optional[Iterable[str]] = None,
|
||||
case_insensitive_sort: bool = False,
|
||||
required_fields: Optional[Iterable[str]] = None,
|
||||
allowed_values: Optional[Mapping[str, Collection[str]]] = None,
|
||||
) -> List[Dict[str, str]]:
|
||||
"""
|
||||
Read a CSV, optionally normalize rows (strip whitespace, lowercase certain fields),
|
||||
validate field values, and write the sorted CSV back to the same path.
|
||||
|
||||
- filepath: Path to the CSV to sort.
|
||||
- field: The field name to sort by.
|
||||
- fields_to_lowercase: Permanently lowercases these field(s) in the data.
|
||||
- strip_whitespace: Remove all whitespace at the beginning and end of field values.
|
||||
- case_insensitive_sort: Ignore case when sorting without changing values.
|
||||
- required_fields: A list of fields that must have data in all rows.
|
||||
- allowed_values: A mapping of allowed values for fields.
|
||||
"""
|
||||
path = Path(filepath)
|
||||
required_fields = set(required_fields or [])
|
||||
lower_set = set(fields_to_lowercase or [])
|
||||
allowed_sets = {k: set(v) for k, v in (allowed_values or {}).items()}
|
||||
if sort_field_value_must_be_unique:
|
||||
seen_sort_field_values = []
|
||||
|
||||
with path.open("r", newline="") as infile:
|
||||
reader = csv.DictReader(infile)
|
||||
fieldnames = reader.fieldnames or []
|
||||
if field not in fieldnames:
|
||||
raise CSVValidationError([f"Missing sort column: {field!r}"])
|
||||
missing_headers = required_fields - set(fieldnames)
|
||||
if missing_headers:
|
||||
raise CSVValidationError(
|
||||
[f"Missing required header(s): {sorted(missing_headers)}"]
|
||||
)
|
||||
rows = list(reader)
|
||||
|
||||
def normalize_row(row: Dict[str, str]) -> None:
|
||||
if strip_whitespace:
|
||||
for k, v in row.items():
|
||||
if isinstance(v, str):
|
||||
row[k] = v.strip()
|
||||
for fld in lower_set:
|
||||
if fld in row and isinstance(row[fld], str):
|
||||
row[fld] = row[fld].lower()
|
||||
|
||||
def validate_row(
|
||||
row: Dict[str, str], sort_field: str, line_no: int, errors: list[str]
|
||||
) -> None:
|
||||
if sort_field_value_must_be_unique:
|
||||
if row[sort_field] in seen_sort_field_values:
|
||||
errors.append(f"Line {line_no}: Duplicate row for '{row[sort_field]}'")
|
||||
else:
|
||||
seen_sort_field_values.append(row[sort_field])
|
||||
for rf in required_fields:
|
||||
val = row.get(rf)
|
||||
if val is None or val == "":
|
||||
errors.append(
|
||||
f"Line {line_no}: Missing value for required field '{rf}'"
|
||||
)
|
||||
for field, allowed_values in allowed_sets.items():
|
||||
if field in row:
|
||||
val = row[field]
|
||||
if val not in allowed_values:
|
||||
errors.append(
|
||||
f"Line {line_no}: '{val}' is not an allowed value for '{field}' "
|
||||
f"(allowed: {sorted(allowed_values)})"
|
||||
)
|
||||
|
||||
errors: list[str] = []
|
||||
for idx, row in enumerate(rows, start=2): # header is line 1
|
||||
normalize_row(row)
|
||||
validate_row(row, field, idx, errors)
|
||||
|
||||
if errors:
|
||||
raise CSVValidationError(errors)
|
||||
|
||||
def sort_key(r: Dict[str, str]):
|
||||
v = r.get(field, "")
|
||||
if isinstance(v, str) and case_insensitive_sort:
|
||||
return v.casefold()
|
||||
return v
|
||||
|
||||
rows.sort(key=sort_key)
|
||||
|
||||
with open(filepath, "w", newline="") as outfile:
|
||||
writer = csv.DictWriter(outfile, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
writer.writerows(rows)
|
||||
|
||||
|
||||
def sort_list_file(
|
||||
filepath: Union[str, Path],
|
||||
*,
|
||||
lowercase: bool = True,
|
||||
strip: bool = True,
|
||||
deduplicate: bool = True,
|
||||
remove_blank_lines: bool = True,
|
||||
ending_newline: bool = True,
|
||||
newline: Optional[str] = "\n",
|
||||
):
|
||||
"""Read a list from a file, sort it, optionally strip and deduplicate the values,
|
||||
then write that list back to the file.
|
||||
|
||||
- filepath: The path to the file.
|
||||
- lowercase: Lowercase all values prior to sorting.
|
||||
- remove_blank_lines: Remove any blank lines.
|
||||
- ending_newline: End the file with a newline, even if remove_blank_lines is true.
|
||||
- newline: The newline character to use.
|
||||
"""
|
||||
with open(filepath, mode="r", newline=newline) as infile:
|
||||
lines = infile.readlines()
|
||||
for i in range(len(lines)):
|
||||
if lowercase:
|
||||
lines[i] = lines[i].lower()
|
||||
if strip:
|
||||
lines[i] = lines[i].strip()
|
||||
if deduplicate:
|
||||
lines = list(set(lines))
|
||||
if remove_blank_lines:
|
||||
while "" in lines:
|
||||
lines.remove("")
|
||||
lines = sorted(lines)
|
||||
if ending_newline:
|
||||
if lines[-1] != "":
|
||||
lines.append("")
|
||||
with open(filepath, mode="w", newline=newline) as outfile:
|
||||
outfile.write("\n".join(lines))
|
||||
|
||||
|
||||
def _main():
|
||||
map_file = "base_reverse_dns_map.csv"
|
||||
map_key = "base_reverse_dns"
|
||||
list_files = ["known_unknown_base_reverse_dns.txt", "psl_overrides.txt"]
|
||||
types_file = "base_reverse_dns_types.txt"
|
||||
|
||||
with open(types_file) as f:
|
||||
types = f.readlines()
|
||||
while "" in types:
|
||||
types.remove("")
|
||||
|
||||
map_allowed_values = {"Type": types}
|
||||
|
||||
for list_file in list_files:
|
||||
if not os.path.exists(list_file):
|
||||
print(f"Error: {list_file} does not exist")
|
||||
exit(1)
|
||||
sort_list_file(list_file)
|
||||
if not os.path.exists(types_file):
|
||||
print(f"Error: {types_file} does not exist")
|
||||
exit(1)
|
||||
sort_list_file(types_file, lowercase=False)
|
||||
if not os.path.exists(map_file):
|
||||
print(f"Error: {map_file} does not exist")
|
||||
exit(1)
|
||||
try:
|
||||
sort_csv(map_file, map_key, allowed_values=map_allowed_values)
|
||||
except CSVValidationError as e:
|
||||
print(f"{map_file} did not validate: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
_main()
|
||||
@@ -5,7 +5,7 @@ import json
|
||||
import urllib3
|
||||
import requests
|
||||
|
||||
from parsedmarc import __version__
|
||||
from parsedmarc.constants import USER_AGENT
|
||||
from parsedmarc.log import logger
|
||||
from parsedmarc.utils import human_timestamp_to_unix_timestamp
|
||||
|
||||
@@ -51,7 +51,7 @@ class HECClient(object):
|
||||
self._common_data = dict(host=self.host, source=self.source, index=self.index)
|
||||
|
||||
self.session.headers = {
|
||||
"User-Agent": "parsedmarc/{0}".format(__version__),
|
||||
"User-Agent": USER_AGENT,
|
||||
"Authorization": "Splunk {0}".format(self.access_token),
|
||||
}
|
||||
|
||||
@@ -78,6 +78,9 @@ class HECClient(object):
|
||||
new_report = dict()
|
||||
for metadata in report["report_metadata"]:
|
||||
new_report[metadata] = report["report_metadata"][metadata]
|
||||
new_report["interval_begin"] = record["interval_begin"]
|
||||
new_report["interval_end"] = record["interval_end"]
|
||||
new_report["normalized_timespan"] = record["normalized_timespan"]
|
||||
new_report["published_policy"] = report["policy_published"]
|
||||
new_report["source_ip_address"] = record["source"]["ip_address"]
|
||||
new_report["source_country"] = record["source"]["country"]
|
||||
@@ -98,7 +101,9 @@ class HECClient(object):
|
||||
new_report["spf_results"] = record["auth_results"]["spf"]
|
||||
|
||||
data["sourcetype"] = "dmarc:aggregate"
|
||||
timestamp = human_timestamp_to_unix_timestamp(new_report["begin_date"])
|
||||
timestamp = human_timestamp_to_unix_timestamp(
|
||||
new_report["interval_begin"]
|
||||
)
|
||||
data["time"] = timestamp
|
||||
data["event"] = new_report.copy()
|
||||
json_str += "{0}\n".format(json.dumps(data))
|
||||
|
||||
@@ -19,10 +19,11 @@ import csv
|
||||
import io
|
||||
|
||||
try:
|
||||
import importlib.resources as pkg_resources
|
||||
from importlib.resources import files
|
||||
except ImportError:
|
||||
# Try backported to PY<37 `importlib_resources`
|
||||
import importlib_resources as pkg_resources
|
||||
# Try backported to PY<3 `importlib_resources`
|
||||
from importlib.resources import files
|
||||
|
||||
|
||||
from dateutil.parser import parse as parse_date
|
||||
import dns.reversename
|
||||
@@ -36,13 +37,19 @@ import requests
|
||||
from parsedmarc.log import logger
|
||||
import parsedmarc.resources.dbip
|
||||
import parsedmarc.resources.maps
|
||||
|
||||
from parsedmarc.constants import USER_AGENT
|
||||
|
||||
parenthesis_regex = re.compile(r"\s*\(.*\)\s*")
|
||||
|
||||
null_file = open(os.devnull, "w")
|
||||
mailparser_logger = logging.getLogger("mailparser")
|
||||
mailparser_logger.setLevel(logging.CRITICAL)
|
||||
psl = publicsuffixlist.PublicSuffixList()
|
||||
psl_overrides_path = str(files(parsedmarc.resources.maps).joinpath("psl_overrides.txt"))
|
||||
with open(psl_overrides_path) as f:
|
||||
psl_overrides = [line.rstrip() for line in f.readlines()]
|
||||
while "" in psl_overrides:
|
||||
psl_overrides.remove("")
|
||||
|
||||
|
||||
class EmailParserError(RuntimeError):
|
||||
@@ -77,7 +84,8 @@ def get_base_domain(domain):
|
||||
|
||||
.. note::
|
||||
Results are based on a list of public domain suffixes at
|
||||
https://publicsuffix.org/list/public_suffix_list.dat.
|
||||
https://publicsuffix.org/list/public_suffix_list.dat and overrides included in
|
||||
parsedmarc.resources.maps.psl_overrides.txt
|
||||
|
||||
Args:
|
||||
domain (str): A domain or subdomain
|
||||
@@ -86,8 +94,12 @@ def get_base_domain(domain):
|
||||
str: The base domain of the given domain
|
||||
|
||||
"""
|
||||
psl = publicsuffixlist.PublicSuffixList()
|
||||
return psl.privatesuffix(domain)
|
||||
domain = domain.lower()
|
||||
publicsuffix = psl.privatesuffix(domain)
|
||||
for override in psl_overrides:
|
||||
if domain.endswith(override):
|
||||
return override.strip(".").strip("-")
|
||||
return publicsuffix
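# A hedged sanity check (not part of the module): with the overrides shipped in
# psl_overrides.txt (e.g. ".amazonaws.com"), a subdomain of an override collapses to
# the override itself rather than the PSL private suffix, while other domains fall
# through to the PSL result. Assuming parsedmarc.utils is importable:
#
#     >>> get_base_domain("mail.compute-1.amazonaws.com")
#     'amazonaws.com'
#     >>> get_base_domain("mail.example.co.uk")
#     'example.co.uk'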
|
||||
|
||||
|
||||
def query_dns(domain, record_type, cache=None, nameservers=None, timeout=2.0):
|
||||
@@ -280,14 +292,13 @@ def get_ip_address_country(ip_address, db_path=None):
|
||||
break
|
||||
|
||||
if db_path is None:
|
||||
with pkg_resources.path(
|
||||
parsedmarc.resources.dbip, "dbip-country-lite.mmdb"
|
||||
) as path:
|
||||
db_path = path
|
||||
db_path = str(
|
||||
files(parsedmarc.resources.dbip).joinpath("dbip-country-lite.mmdb")
|
||||
)
|
||||
|
||||
db_age = datetime.now() - datetime.fromtimestamp(os.stat(db_path).st_mtime)
|
||||
if db_age > timedelta(days=30):
|
||||
logger.warning("IP database is more than a month old")
|
||||
db_age = datetime.now() - datetime.fromtimestamp(os.stat(db_path).st_mtime)
|
||||
if db_age > timedelta(days=30):
|
||||
logger.warning("IP database is more than a month old")
|
||||
|
||||
db_reader = geoip2.database.Reader(db_path)
|
||||
|
||||
@@ -344,21 +355,30 @@ def get_service_from_reverse_dns_base_domain(
|
||||
|
||||
if not (offline or always_use_local_file) and len(reverse_dns_map) == 0:
|
||||
try:
|
||||
logger.debug(f"Trying to fetch " f"reverse DNS map from {url}...")
|
||||
csv_file.write(requests.get(url).text)
|
||||
logger.debug(f"Trying to fetch reverse DNS map from {url}...")
|
||||
headers = {"User-Agent": USER_AGENT}
|
||||
response = requests.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
csv_file.write(response.text)
|
||||
csv_file.seek(0)
|
||||
load_csv(csv_file)
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.warning(f"Failed to fetch reverse DNS map: {e}")
|
||||
except Exception:
|
||||
logger.warning("Not a valid CSV file")
|
||||
csv_file.seek(0)
|
||||
logging.debug("Response body:")
|
||||
logger.debug(csv_file.read())
|
||||
|
||||
if len(reverse_dns_map) == 0:
|
||||
logger.info("Loading included reverse DNS map...")
|
||||
with pkg_resources.path(
|
||||
parsedmarc.resources.maps, "base_reverse_dns_map.csv"
|
||||
) as path:
|
||||
if local_file_path is not None:
|
||||
path = local_file_path
|
||||
with open(path) as csv_file:
|
||||
load_csv(csv_file)
|
||||
path = str(
|
||||
files(parsedmarc.resources.maps).joinpath("base_reverse_dns_map.csv")
|
||||
)
|
||||
if local_file_path is not None:
|
||||
path = local_file_path
|
||||
with open(path) as csv_file:
|
||||
load_csv(csv_file)
|
||||
try:
|
||||
service = reverse_dns_map[base_domain]
|
||||
except KeyError:
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import requests
|
||||
|
||||
from parsedmarc import logger
|
||||
from parsedmarc.constants import USER_AGENT
|
||||
|
||||
|
||||
class WebhookClient(object):
|
||||
@@ -21,7 +22,7 @@ class WebhookClient(object):
|
||||
self.timeout = timeout
|
||||
self.session = requests.Session()
|
||||
self.session.headers = {
|
||||
"User-Agent": "parsedmarc",
|
||||
"User-Agent": USER_AGENT,
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
[build-system]
requires = [
"hatchling>=1.8.1",
"hatchling>=1.27.0",
]
build-backend = "hatchling.build"

@@ -55,11 +55,12 @@ dependencies = [
"tqdm>=4.31.1",
"urllib3>=1.25.7",
"xmltodict>=0.12.0",
"PyYAML>=6.0.3"
]

[project.optional-dependencies]
build = [
"hatch",
"hatch>=1.14.0",
"myst-parser[linkify]",
"nose",
"pytest",
@@ -76,9 +77,20 @@ parsedmarc = "parsedmarc.cli:_main"
Homepage = "https://domainaware.github.io/parsedmarc"

[tool.hatch.version]
path = "parsedmarc/__init__.py"
path = "parsedmarc/constants.py"

[tool.hatch.build.targets.sdist]
include = [
"/parsedmarc",
]

[tool.hatch.build]
exclude = [
"base_reverse_dns.csv",
"find_bad_utf8.py",
"find_unknown_base_reverse_dns.py",
"unknown_base_reverse_dns.csv",
"sortmaps.py",
"README.md",
"*.bak"
]

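Pointing [tool.hatch.version] at parsedmarc/constants.py means that file now carries the canonical version string that hatchling extracts at build time, and it is also where the USER_AGENT constant referenced in the hunks above would naturally live. The actual contents of constants.py are not part of this diff; a plausible minimal shape, with placeholder values, would be:

    # Hypothetical sketch of parsedmarc/constants.py; the real file is not
    # shown in this diff, and the version number here is a placeholder.
    __version__ = "8.0.0"

    USER_AGENT = f"parsedmarc/{__version__}"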
splunk/smtp_tls_dashboard.xml (new file, 107 lines)
@@ -0,0 +1,107 @@
<form version="1.1" theme="dark">
  <label>SMTP TLS Reporting</label>
  <fieldset submitButton="false" autoRun="true">
    <input type="time" token="time">
      <label></label>
      <default>
        <earliest>-7d@h</earliest>
        <latest>now</latest>
      </default>
    </input>
    <input type="text" token="organization_name" searchWhenChanged="true">
      <label>Organization name</label>
      <default>*</default>
      <initialValue>*</initialValue>
    </input>
    <input type="text" token="policy_domain">
      <label>Policy domain</label>
      <default>*</default>
      <initialValue>*</initialValue>
    </input>
    <input type="dropdown" token="policy_type" searchWhenChanged="true">
      <label>Policy type</label>
      <choice value="*">Any</choice>
      <choice value="tlsa">tlsa</choice>
      <choice value="sts">sts</choice>
      <choice value="no-policy-found">no-policy-found</choice>
      <default>*</default>
      <initialValue>*</initialValue>
    </input>
  </fieldset>
  <row>
    <panel>
      <title>Reporting organizations</title>
      <table>
        <search>
          <query>index=email sourcetype=smtp:tls organization_name=$organization_name$ policies{}.policy_domain=$policy_domain$
| rename policies{}.policy_domain as policy_domain
| rename policies{}.policy_type as policy_type
| rename policies{}.failed_session_count as failed_sessions
| rename policies{}.failure_details{}.failed_session_count as failed_sessions
| rename policies{}.successful_session_count as successful_sessions
| rename policies{}.failure_details{}.sending_mta_ip as sending_mta_ip
| rename policies{}.failure_details{}.receiving_ip as receiving_ip
| rename policies{}.failure_details{}.receiving_mx_hostname as receiving_mx_hostname
| rename policies{}.failure_details{}.result_type as failure_type
| fillnull value=0 failed_sessions
| stats sum(failed_sessions) as failed_sessions sum(successful_sessions) as successful_sessions by organization_name
| sort -successful_sessions 0</query>
          <earliest>$time.earliest$</earliest>
          <latest>$time.latest$</latest>
        </search>
        <option name="drilldown">none</option>
        <option name="refresh.display">progressbar</option>
      </table>
    </panel>
    <panel>
      <title>Domains</title>
      <table>
        <search>
          <query>index=email sourcetype=smtp:tls organization_name=$organization_name$ policies{}.policy_domain=$policy_domain$
| rename policies{}.policy_domain as policy_domain
| rename policies{}.policy_type as policy_type
| rename policies{}.failed_session_count as failed_sessions
| rename policies{}.failure_details{}.failed_session_count as failed_sessions
| rename policies{}.successful_session_count as successful_sessions
| rename policies{}.failure_details{}.sending_mta_ip as sending_mta_ip
| rename policies{}.failure_details{}.receiving_ip as receiving_ip
| rename policies{}.failure_details{}.receiving_mx_hostname as receiving_mx_hostname
| rename policies{}.failure_details{}.result_type as failure_type
| fillnull value=0 failed_sessions
| stats sum(failed_sessions) as failed_sessions sum(successful_sessions) as successful_sessions by policy_domain
| sort -successful_sessions 0</query>
          <earliest>$time.earliest$</earliest>
          <latest>$time.latest$</latest>
        </search>
        <option name="drilldown">none</option>
        <option name="refresh.display">progressbar</option>
      </table>
    </panel>
  </row>
  <row>
    <panel>
      <title>Failure details</title>
      <table>
        <search>
          <query>index=email sourcetype=smtp:tls organization_name=$organization_name$ policies{}.policy_domain=$policy_domain$ policies{}.failure_details{}.result_type=*
| rename policies{}.policy_domain as policy_domain
| rename policies{}.policy_type as policy_type
| rename policies{}.failed_session_count as failed_sessions
| rename policies{}.failure_details{}.failed_session_count as failed_sessions
| rename policies{}.successful_session_count as successful_sessions
| rename policies{}.failure_details{}.sending_mta_ip as sending_mta_ip
| rename policies{}.failure_details{}.receiving_ip as receiving_ip
| rename policies{}.failure_details{}.receiving_mx_hostname as receiving_mx_hostname
| fillnull value=0 failed_sessions
| rename policies{}.failure_details{}.result_type as failure_type
| table _time organization_name policy_domain policy_type failed_sessions successful_sessions sending_mta_ip receiving_ip receiving_mx_hostname failure_type
| sort by -_time 0</query>
          <earliest>$time.earliest$</earliest>
          <latest>$time.latest$</latest>
        </search>
        <option name="drilldown">none</option>
        <option name="refresh.display">progressbar</option>
      </table>
    </panel>
  </row>
</form>
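The policies{}.field names in these searches are Splunk's notation for fields nested inside the policies JSON array of a parsed SMTP TLS (RFC 8460) report event. A trimmed sketch of the event shape the dashboard assumes, limited to the fields the searches actually reference (values are illustrative):

    # Trimmed sketch of a parsed SMTP TLS report event; field names are taken
    # from the dashboard queries above, all values are illustrative.
    smtp_tls_event = {
        "organization_name": "Example Mail Provider",
        "policies": [
            {
                "policy_domain": "example.com",
                "policy_type": "sts",
                "successful_session_count": 1523,
                "failed_session_count": 2,
                "failure_details": [
                    {
                        "result_type": "certificate-expired",
                        "sending_mta_ip": "203.0.113.10",
                        "receiving_ip": "198.51.100.25",
                        "receiving_mx_hostname": "mx1.example.com",
                        "failed_session_count": 2,
                    }
                ],
            }
        ],
    }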
tests.py (34 changed lines)
@@ -43,11 +43,12 @@ class Test(unittest.TestCase):

def testExtractReportXMLComparator(self):
"""Test XML comparator function"""
print()
xmlnice = open("samples/extract_report/nice-input.xml").read()
print(xmlnice)
xmlchanged = minify_xml(open("samples/extract_report/changed-input.xml").read())
print(xmlchanged)
xmlnice_file = open("samples/extract_report/nice-input.xml")
xmlnice = xmlnice_file.read()
xmlnice_file.close()
xmlchanged_file = open("samples/extract_report/changed-input.xml")
xmlchanged = minify_xml(xmlchanged_file.read())
xmlchanged_file.close()
self.assertTrue(compare_xml(xmlnice, xmlnice))
self.assertTrue(compare_xml(xmlchanged, xmlchanged))
self.assertFalse(compare_xml(xmlnice, xmlchanged))
@@ -62,7 +63,9 @@ class Test(unittest.TestCase):
data = f.read()
print("Testing {0}: ".format(file), end="")
xmlout = parsedmarc.extract_report(data)
xmlin = open("samples/extract_report/nice-input.xml").read()
xmlin_file = open("samples/extract_report/nice-input.xml")
xmlin = xmlin_file.read()
xmlin_file.close()
self.assertTrue(compare_xml(xmlout, xmlin))
print("Passed!")

@@ -72,7 +75,9 @@ class Test(unittest.TestCase):
file = "samples/extract_report/nice-input.xml"
print("Testing {0}: ".format(file), end="")
xmlout = parsedmarc.extract_report(file)
xmlin = open("samples/extract_report/nice-input.xml").read()
xmlin_file = open("samples/extract_report/nice-input.xml")
xmlin = xmlin_file.read()
xmlin_file.close()
self.assertTrue(compare_xml(xmlout, xmlin))
print("Passed!")

@@ -82,7 +87,9 @@ class Test(unittest.TestCase):
file = "samples/extract_report/nice-input.xml.gz"
print("Testing {0}: ".format(file), end="")
xmlout = parsedmarc.extract_report_from_file_path(file)
xmlin = open("samples/extract_report/nice-input.xml").read()
xmlin_file = open("samples/extract_report/nice-input.xml")
xmlin = xmlin_file.read()
xmlin_file.close()
self.assertTrue(compare_xml(xmlout, xmlin))
print("Passed!")

@@ -92,12 +99,13 @@ class Test(unittest.TestCase):
file = "samples/extract_report/nice-input.xml.zip"
print("Testing {0}: ".format(file), end="")
xmlout = parsedmarc.extract_report_from_file_path(file)
print(xmlout)
xmlin = minify_xml(open("samples/extract_report/nice-input.xml").read())
print(xmlin)
xmlin_file = open("samples/extract_report/nice-input.xml")
xmlin = minify_xml(xmlin_file.read())
xmlin_file.close()
self.assertTrue(compare_xml(xmlout, xmlin))
xmlin = minify_xml(open("samples/extract_report/changed-input.xml").read())
print(xmlin)
xmlin_file = open("samples/extract_report/changed-input.xml")
xmlin = xmlin_file.read()
xmlin_file.close()
self.assertFalse(compare_xml(xmlout, xmlin))
print("Passed!")
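These test changes replace one-line open().read() expressions with explicit open/read/close triples so the file handles are released deterministically instead of relying on garbage collection. The same effect can be had more idiomatically with a context manager, for example:

    # Equivalent, more idiomatic form of the explicit open/read/close pattern
    # adopted in the tests above; the path comes from the diff itself.
    with open("samples/extract_report/nice-input.xml") as xmlnice_file:
        xmlnice = xmlnice_file.read()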