Compare commits

...

86 Commits

Author SHA1 Message Date
Sean Whalen
918501ccb5 Better formatting 2025-06-02 15:20:40 -04:00
Sean Whalen
036c372ea3 8.18.2
- Merged PR #603
  - Fixes issue #595 - CI test fails for Elasticsearch
    - Moved Elasticsearch to a separate Docker service container for CI testing
    - Dropped Python 3.8 from CI testing
  - Fixes lookup and saving of DMARC forensic reports in Elasticsearch and OpenSearch
- Updated fallback `base_reverse_dns_map.csv`, which now includes over 1,400 lines
- Updated included `dbip-country-lite.mmdb` to the June 2025 release
- Automatically fall back to the internal `base_reverse_dns_map.csv` if the received file is not valid (Fixes #602)
  - Print the received data to the debug log
2025-06-02 15:19:19 -04:00
Sean Whalen
a969d83137 Update included IP database 2025-06-02 11:30:26 -04:00
Szasza Palmer
e299f7d161 fixing ES/OS forensic report lookup and storage, extracting ES to separate CI service (#603)
* fixing ES/OS forensic report lookup and storage, extracting ES to separate CI service

* bumping CI ES version to current latest

* reshuffling CI job attributes

* removing EOL Python 3.8 from the CI pipeline
2025-06-02 11:10:10 -04:00
Sean Whalen
4c04418dae Fix domain lists check 2025-04-24 16:03:18 -04:00
Sean Whalen
2ca9373ed0 Match dashboard fields 2025-04-24 15:44:22 -04:00
Sean Whalen
961ef6d804 Revert adding the BOM
It broke reading the file with python
2025-04-24 14:04:36 -04:00
Sean Whalen
573ba1e3e9 Add UTF-8 BOM to the CSV so Excel will open the file as UTF-8 2025-04-24 13:58:06 -04:00
Sean Whalen
1d8af3ccff Add find_unknown_base_reverse_dns.py 2025-04-24 13:48:51 -04:00
Sean Whalen
8426daa26b Remove duplicate domains 2025-04-24 13:47:07 -04:00
Sean Whalen
d1531b86f2 Sort known_unknown_base_reverse_dns.txt 2025-04-24 11:42:09 -04:00
Sean Whalen
8bb046798c Simplify sender types 2025-04-23 16:20:05 -04:00
Sean Whalen
d64e12548a Fix errors in the reverse DNS map 2025-04-23 15:57:31 -04:00
Sean Whalen
380479cbf1 Update reverse DNS map 2025-04-23 15:43:38 -04:00
Sean Whalen
ace21c8084 Update base_reverse_dns_map.csv and add known_unknown_base_reverse_dns.txt 2025-04-23 15:36:14 -04:00
Sean Whalen
1a1aef21ad Replace deprecated path call with file call 2025-04-23 15:33:27 -04:00
Sean Whalen
532dbbdb7e Fix file formatting 2025-04-23 15:32:04 -04:00
miles
45738ae688 Fix SyntaxError in elastic forensic report (#598) 2025-04-23 14:40:03 -04:00
Sean Whalen
9d77bd64bc Fix some CSV entries 2025-04-01 09:23:44 -04:00
Sean Whalen
140290221d Update elastic.py 2025-03-22 15:09:44 -04:00
Sean Whalen
187d61b770 Update elastic.py 2025-03-22 15:03:42 -04:00
Sean Whalen
0443b7365e Update elastic.py 2025-03-22 14:47:50 -04:00
Sean Whalen
d7b887a835 Debug elasticsearch 2025-03-22 14:42:45 -04:00
Tom Henderson
a805733221 Raise for failed status (#594) 2025-03-22 11:22:49 -04:00
Sean Whalen
9552c3ac92 Update README.md 2025-03-21 09:41:14 -04:00
Sean Whalen
5273948be0 Make build.sh usable without the gh-pages branch 2025-02-18 09:17:12 -05:00
Sean Whalen
b51756b8bd 8.18.1
- Add missing `https://` to the default Microsoft Graph URL
2025-02-17 12:41:57 -05:00
Sean Whalen
7fa7c24cb8 Merge branch 'master' of https://github.com/domainaware/parsedmarc 2025-02-17 12:31:47 -05:00
Sean Whalen
972237ae7e Fix default Microsoft Graph URL 2025-02-17 12:31:39 -05:00
Sean Whalen
6e5333a342 Style fixes 2025-02-03 16:11:21 -05:00
Sean Whalen
47b074c80b Merge branch 'master' of https://github.com/domainaware/parsedmarc 2025-02-03 16:11:01 -05:00
Sean Whalen
a1cfeb3081 8.18.0
- Add support for Microsoft national clouds via Graph API base URL (PR #590)
- Avoid stopping processing when an invalid DMARC report is encountered (PR #587)
- Increase `http.client._MAXHEADERS` from `100` to `200` to avoid errors connecting to Elasticsearch/OpenSearch (PR #589)
2025-02-03 16:10:51 -05:00
Paul Hecker
c7c451b1b1 Set http.client._MAXHEADERS to 200 (#589) 2025-02-03 15:26:15 -05:00
Kevin Goad
669deb9755 Add support for Microsoft national clouds via Graph API base URL (#590)
* adding support for Microsoft National Clouds

* Update usage.md
2025-02-03 15:25:15 -05:00
bendem
446c018920 do not stop processing when we encounter an invalid dmarc report (#587) 2025-02-03 15:20:52 -05:00
Sean Whalen
38c6f86973 Update CHANGELOG.md 2025-01-10 09:09:24 -05:00
Sean Whalen
62ccc11925 Update changelog 2025-01-09 22:25:43 -05:00
Sean Whalen
c32ca3cae3 Fix sortmaps.py 2025-01-09 22:24:03 -05:00
Sean Whalen
010f1f84a7 8.17.0
- Ignore duplicate aggregate DMARC reports with the same `org_name` and `report_id` seen within the same hour ([#539](https://github.com/domainaware/parsedmarc/issues/539))
- Fix saving SMTP TLS reports to OpenSearch (PR #585 closed issue #576)
- Add 303 entries to `base_reverse_dns_map.csv`
2025-01-09 22:22:55 -05:00
Anael Mobilia
7da57c6382 Fix colors on export.ndjson (#586)
Old elements are put on compatibility color palette => update to status color palette
2025-01-09 22:09:44 -05:00
Sean Whalen
d08e29a306 Move sortmaps.py 2025-01-09 22:08:42 -05:00
Sean Whalen
e1e53ad4cb Use Python instead of Excel for sorting map CSVs 2025-01-09 22:03:49 -05:00
Sean Whalen
4670e9687d Update base_reverse_dns_map.csv 2025-01-09 21:18:00 -05:00
Sean Whalen
7f8a2c08cd Use a smaller key value 2025-01-09 19:34:56 -05:00
Sean Whalen
e9c05dd0bf Update base_reverse_dns_map.csv 2025-01-08 20:51:44 -05:00
Sean Whalen
9348a474dd Actually fix the CLI 2025-01-08 20:49:39 -05:00
Sean Whalen
e0decaba8c Fix CLI 2025-01-07 14:33:35 -05:00
Sean Whalen
26a651cded Use a combination of report org and report ID when checking for duplicate aggregate reports 2025-01-07 14:25:57 -05:00
Sean Whalen
bcfcd93fc6 More duplicate aggregate report checks
#535
2025-01-07 13:56:26 -05:00
Sean Whalen
54d5ed3543 Remove unused import 2025-01-07 12:57:41 -05:00
Sean Whalen
1efbc87e0e Consolidate SEEN_AGGREGATE_REPORT_IDS 2025-01-07 12:56:30 -05:00
Sean Whalen
e78e7f64af Add parsedmarc.ini to .gitignore 2025-01-07 11:59:03 -05:00
Szasza Palmer
ad9de65b99 fixing SMTP TLS report saving to OpenSearch (#585) 2025-01-07 11:57:04 -05:00
Sean Whalen
b9df12700b Check for duplicate aggregate report IDs when processing a mailbox
Fix #535
2025-01-07 11:56:51 -05:00
Sean Whalen
20843b920f Sort reverse DNS map 2025-01-06 21:26:48 -05:00
Sean Whalen
e5ae89fedf Merge branch 'master' of https://github.com/domainaware/parsedmarc 2025-01-06 21:21:57 -05:00
Sean Whalen
f148cff11c Update reverse DNS map 2025-01-06 21:19:06 -05:00
Sean Whalen
4583769e04 Update reverse DNS map 2025-01-03 09:23:06 -05:00
Sean Whalen
0ecb80b27c Update reverse DNS map 2024-12-30 11:40:29 -05:00
Sean Whalen
b8e62e6d3b Remove duplicate entry 2024-12-28 14:14:00 -05:00
Sean Whalen
c67953a2c5 Update reverse DNS map 2024-12-28 14:10:39 -05:00
Sean Whalen
27dff4298c Update reverse DNS mapping 2024-12-28 11:53:50 -05:00
Sean Whalen
f2133aacd4 Fix build dependencies 2024-12-25 18:52:42 -05:00
Sean Whalen
31917e58a9 Update build backend 2024-12-25 18:28:30 -05:00
Sean Whalen
bffb98d217 Get report ID correctly 2024-12-25 16:37:40 -05:00
Sean Whalen
1f93b3a7ea Set max_len to a value 2024-12-25 16:26:38 -05:00
Sean Whalen
88debb9729 Fix SEEN_AGGREGATE_REPORT_IDS 2024-12-25 16:21:07 -05:00
Sean Whalen
a8a5564780 Merge branch 'master' of https://github.com/domainaware/parsedmarc 2024-12-25 16:14:40 -05:00
Sean Whalen
1e26f95b7b 8.16.1
- Ignore aggregate DMARC reports seen within a period of one hour (#535)
2024-12-25 16:14:33 -05:00
ericericsw
82b48e4d01 Add files via upload (#578)
Update the dashboards to a new version

Panel model changes:
grafana-piechart-panel -> pie chart
Graph(old) -> time series
worldmap panel -> geomap

Some table panels have changed; for example, the overview now includes an ARC column

Known issue that cannot be solved at the moment: multiple DKIM entries cause table display errors
2024-12-25 16:09:43 -05:00
Sean Whalen
617b7c5b4a Merge PR #527 2024-11-09 18:18:31 -05:00
Sean Whalen
989bfd8f07 Code cleanup 2024-11-02 11:40:37 -04:00
Sean Whalen
908cc2918c Merge branch 'ramspoluri-master' 2024-11-02 11:39:34 -04:00
Sean Whalen
bd5774d71d Merge branch 'master' of https://github.com/ramspoluri/parsedmarc into ramspoluri-master 2024-11-02 11:38:41 -04:00
Sean Whalen
8e9112bad3 Merge branch 'master' of https://github.com/ramspoluri/parsedmarc 2024-11-02 10:48:15 -04:00
Sean Whalen
40e041a8af Merge branch 'master' of https://github.com/ramspoluri/parsedmarc 2024-11-02 10:48:10 -04:00
Sean Whalen
7ba433cddb Fix code style 2024-11-02 10:39:05 -04:00
Sean Whalen
6d467c93f9 Update __init__.py
Add reference to https://www.rfc-editor.org/rfc/rfc3501#page-52
2024-11-02 10:35:22 -04:00
Sean Whalen
be38e83761 Code cleanup 2024-11-02 10:28:11 -04:00
Sean Whalen
ef4e1ac8dc Code cleanup 2024-11-02 10:26:30 -04:00
Sean Whalen
39e4c22ecc Fix syntax 2024-11-02 10:23:23 -04:00
Sean Whalen
88ff3a2c23 Update syntax to support Python < 3.10 2024-11-02 10:04:01 -04:00
Sean Whalen
d8aee569f7 Update __init__.py 2024-11-02 09:50:55 -04:00
Sean Whalen
debc28cc6e 8.15.4
- Fix crash if aggregate report timespan is > 24 hours
2024-10-24 19:53:44 -04:00
Sean Whalen
52ccf0536c 8.15.3
- Ignore aggregate reports with a timespan of > 24 hours (Fixes #282)
2024-10-24 19:43:28 -04:00
ramspoluri
f618f69c6c Added 'since' option to search for messages since a certain time
- Added a `since` option under the `mailbox` section to search for messages since a certain time instead of going through the complete mailbox during testing scenarios. Acceptable values: `5m|3h|2d|1w`; units: {"m": "minutes", "h": "hours", "d": "days", "w": "weeks"}. Defaults to `1d` if an incorrect value is provided.
    - Do not mark messages as read if the test option is selected (works only for MSGraphConnection)
2024-05-24 20:43:36 +05:30
26 changed files with 7294 additions and 268 deletions

View File

@@ -11,13 +11,26 @@ on:
jobs:
build:
runs-on: ubuntu-latest
services:
elasticsearch:
image: elasticsearch:8.18.2
env:
discovery.type: single-node
cluster.name: parsedmarc-cluster
discovery.seed_hosts: elasticsearch
bootstrap.memory_lock: true
xpack.security.enabled: false
xpack.license.self_generated.type: basic
ports:
- 9200:9200
- 9300:9300
strategy:
fail-fast: false
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v4
@@ -29,13 +42,6 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install -y libemail-outlook-message-perl
wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo gpg --dearmor -o /usr/share/keyrings/elasticsearch-keyring.gpg
sudo apt-get install apt-transport-https
echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/8.x/apt stable main" | sudo tee /etc/apt/sources.list.d/elastic-8.x.list
sudo apt-get update && sudo apt-get install elasticsearch
sudo sed -i 's/xpack.security.enabled: true/xpack.security.enabled: false/' /etc/elasticsearch/elasticsearch.yml
sudo systemctl restart elasticsearch
sudo systemctl --no-pager status elasticsearch
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip

6
.gitignore vendored
View File

@@ -136,3 +136,9 @@ samples/private
*.html
*.sqlite-journal
parsedmarc.ini
scratch.py
parsedmarc/resources/maps/base_reverse_dns.csv
parsedmarc/resources/maps/unknown_base_reverse_dns.csv

View File

@@ -70,6 +70,7 @@
"modindex",
"msgconvert",
"msgraph",
"MSSP",
"Munge",
"ndjson",
"newkey",
@@ -101,6 +102,7 @@
"sourcetype",
"STARTTLS",
"tasklist",
"timespan",
"tlsa",
"tlsrpt",
"toctree",

View File

@@ -1,6 +1,58 @@
Changelog
=========
8.18.2
------
- Merged PR #603
- Fixes issue #595 - CI test fails for Elasticsearch
- Moved Elasticsearch to a separate Docker service container for CI testing
- Dropped Python 3.8 from CI testing
- Fixes lookup and saving of DMARC forensic reports in Elasticsearch and OpenSearch
- Updated fallback `base_reverse_dns_map.csv`, which now includes over 1,400 lines
- Updated included `dbip-country-lite.mmdb` to the June 2025 release
- Automatically fall back to the internal `base_reverse_dns_map.csv` if the received file is not valid (Fixes #602)
- Print the received data to the debug log
8.18.1
------
- Add missing `https://` to the default Microsoft Graph URL
8.18.0
------
- Add support for Microsoft national clouds via Graph API base URL (PR #590)
- Avoid stopping processing when an invalid DMARC report is encountered (PR #587)
- Increase `http.client._MAXHEADERS` from `100` to `200` to avoid errors connecting to Elasticsearch/OpenSearch (PR #589)
8.17.0
------
- Ignore duplicate aggregate DMARC reports with the same `org_name` and `report_id` seen within the same hour (Fixes #535)
- Fix saving SMTP TLS reports to OpenSearch (PR #585 closed issue #576)
- Add 303 entries to `base_reverse_dns_map.csv`
8.16.1
------
- Failed attempt to ignore aggregate DMARC reports seen within a period of one hour (#535)
8.16.0
------
- Add a `since` option to only search for emails since a certain time (PR #527)
8.15.4
------
- Fix crash if aggregate report timespan is > 24 hours
8.15.3
------
- Ignore aggregate reports with a timespan of > 24 hours (Fixes #282)
8.15.2
------

View File

@@ -42,6 +42,6 @@ Thanks to all
- Consistent data structures
- Simple JSON and/or CSV output
- Optionally email the results
- Optionally send the results to Elasticsearch and/or Splunk, for use
- Optionally send the results to Elasticsearch, Opensearch, and/or Splunk, for use
with premade dashboards
- Optionally send reports to Apache Kafka

View File

@@ -9,12 +9,17 @@ fi
. venv/bin/activate
pip install .[build]
ruff format .
ruff check .
cd docs
make clean
make html
touch build/html/.nojekyll
cp -rf build/html/* ../../parsedmarc-docs/
if [ -d "./../parsedmarc-docs" ]; then
cp -rf build/html/* ../../parsedmarc-docs/
fi
cd ..
sort -o "parsedmarc/resources/maps/known_unknown_base_reverse_dns.txt" "parsedmarc/resources/maps/known_unknown_base_reverse_dns.txt"
./sortmaps.py
python3 tests.py
rm -rf dist/ build/
hatch build

View File

@@ -28,3 +28,30 @@ services:
interval: 10s
timeout: 10s
retries: 24
opensearch:
image: opensearchproject/opensearch:2.18.0
environment:
- network.host=127.0.0.1
- http.host=0.0.0.0
- node.name=opensearch
- discovery.type=single-node
- cluster.name=parsedmarc-cluster
- discovery.seed_hosts=opensearch
- bootstrap.memory_lock=true
- OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD}
ports:
- 127.0.0.1:9201:9200
ulimits:
memlock:
soft: -1
hard: -1
healthcheck:
test:
[
"CMD-SHELL",
"curl -s -XGET http://localhost:9201/_cluster/health?pretty | grep status | grep -q '\\(green\\|yellow\\)'"
]
interval: 10s
timeout: 10s
retries: 24

View File

@@ -166,6 +166,9 @@ The full set of configuration options are:
- `check_timeout` - int: Number of seconds to wait for an IMAP
IDLE response or the number of seconds until the next
mail check (Default: `30`)
- `since` - str: Search for messages since a certain time. (Examples: `5m|3h|2d|1w`)
Acceptable units: {"m": "minutes", "h": "hours", "d": "days", "w": "weeks"}.
Defaults to `1d` if an incorrect value is provided.
- `imap`
- `host` - str: The IMAP server hostname or IP address
- `port` - int: The IMAP server port (Default: `993`)
@@ -205,6 +208,8 @@ The full set of configuration options are:
- `mailbox` - str: The mailbox name. This defaults to the
current user if using the UsernamePassword auth method, but
could be a shared mailbox if the user has access to the mailbox
- `graph_url` - str: Microsoft Graph URL. Allows for use of National Clouds (e.g., Azure Gov)
(Default: https://graph.microsoft.com)
- `token_file` - str: Path to save the token file
(Default: `.token`)
- `allow_unencrypted_storage` - bool: Allows the Azure Identity

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@@ -17,7 +17,7 @@ import zlib
from base64 import b64decode
from collections import OrderedDict
from csv import DictWriter
from datetime import datetime
from datetime import datetime, timedelta
from io import BytesIO, StringIO
from typing import Callable
@@ -28,13 +28,18 @@ from lxml import etree
from mailsuite.smtp import send_email
from parsedmarc.log import logger
from parsedmarc.mail import MailboxConnection
from parsedmarc.mail import (
MailboxConnection,
IMAPConnection,
MSGraphConnection,
GmailConnection,
)
from parsedmarc.utils import get_base_domain, get_ip_address_info
from parsedmarc.utils import is_outlook_msg, convert_outlook_msg
from parsedmarc.utils import parse_email
from parsedmarc.utils import timestamp_to_human, human_timestamp_to_datetime
__version__ = "8.15.2"
__version__ = "8.18.2"
logger.debug("parsedmarc v{0}".format(__version__))
@@ -49,6 +54,7 @@ MAGIC_XML = b"\x3c\x3f\x78\x6d\x6c\x20"
MAGIC_JSON = b"\7b"
IP_ADDRESS_CACHE = ExpiringDict(max_len=10000, max_age_seconds=14400)
SEEN_AGGREGATE_REPORT_IDS = ExpiringDict(max_len=100000000, max_age_seconds=3600)
REVERSE_DNS_MAP = dict()
@@ -266,7 +272,7 @@ def _parse_smtp_tls_failure_details(failure_details):
return new_failure_details
except KeyError as e:
raise InvalidSMTPTLSReport(f"Missing required failure details field:" f" {e}")
raise InvalidSMTPTLSReport(f"Missing required failure details field: {e}")
except Exception as e:
raise InvalidSMTPTLSReport(str(e))
@@ -278,7 +284,7 @@ def _parse_smtp_tls_report_policy(policy):
policy_type = policy["policy"]["policy-type"]
failure_details = []
if policy_type not in policy_types:
raise InvalidSMTPTLSReport(f"Invalid policy type " f"{policy_type}")
raise InvalidSMTPTLSReport(f"Invalid policy type {policy_type}")
new_policy = OrderedDict(policy_domain=policy_domain, policy_type=policy_type)
if "policy-string" in policy["policy"]:
if isinstance(policy["policy"]["policy-string"], list):
@@ -326,9 +332,7 @@ def parse_smtp_tls_report_json(report):
raise Exception(f"Missing required field: {required_field}]")
if not isinstance(report["policies"], list):
policies_type = type(report["policies"])
raise InvalidSMTPTLSReport(
f"policies must be a list, " f"not {policies_type}"
)
raise InvalidSMTPTLSReport(f"policies must be a list, not {policies_type}")
for policy in report["policies"]:
policies.append(_parse_smtp_tls_report_policy(policy))
@@ -519,7 +523,7 @@ def parse_aggregate_report_xml(
date_range = report["report_metadata"]["date_range"]
if int(date_range["end"]) - int(date_range["begin"]) > 2 * 86400:
_error = "Time span > 24 hours - RFC 7489 section 7.2"
errors.append(_error)
raise InvalidAggregateReport(_error)
date_range["begin"] = timestamp_to_human(date_range["begin"])
date_range["end"] = timestamp_to_human(date_range["end"])
new_report_metadata["begin_date"] = date_range["begin"]
@@ -1240,11 +1244,11 @@ def parse_report_email(
field_name = match[0].lower().replace(" ", "-")
fields[field_name] = match[1].strip()
feedback_report = "Arrival-Date: {}\n" "Source-IP: {}" "".format(
feedback_report = "Arrival-Date: {}\nSource-IP: {}".format(
fields["received-date"], fields["sender-ip-address"]
)
except Exception as e:
error = "Unable to parse message with " 'subject "{0}": {1}'.format(
error = 'Unable to parse message with subject "{0}": {1}'.format(
subject, e
)
raise InvalidDMARCReport(error)
@@ -1288,10 +1292,10 @@ def parse_report_email(
"is not a valid "
"aggregate DMARC report: {1}".format(subject, e)
)
raise ParserError(error)
raise InvalidDMARCReport(error)
except Exception as e:
error = "Unable to parse message with " 'subject "{0}": {1}'.format(
error = 'Unable to parse message with subject "{0}": {1}'.format(
subject, e
)
raise ParserError(error)
@@ -1325,7 +1329,7 @@ def parse_report_email(
return result
if result is None:
error = 'Message with subject "{0}" is ' "not a valid report".format(subject)
error = 'Message with subject "{0}" is not a valid report'.format(subject)
raise InvalidDMARCReport(error)
@@ -1465,7 +1469,17 @@ def get_dmarc_reports_from_mbox(
strip_attachment_payloads=sa,
)
if parsed_email["report_type"] == "aggregate":
aggregate_reports.append(parsed_email["report"])
report_org = parsed_email["report"]["report_metadata"]["org_name"]
report_id = parsed_email["report"]["report_metadata"]["report_id"]
report_key = f"{report_org}_{report_id}"
if report_key not in SEEN_AGGREGATE_REPORT_IDS:
SEEN_AGGREGATE_REPORT_IDS[report_key] = True
aggregate_reports.append(parsed_email["report"])
else:
logger.debug(
"Skipping duplicate aggregate report "
f"from {report_org} with ID: {report_id}"
)
elif parsed_email["report_type"] == "forensic":
forensic_reports.append(parsed_email["report"])
elif parsed_email["report_type"] == "smtp_tls":
@@ -1499,6 +1513,7 @@ def get_dmarc_reports_from_mailbox(
strip_attachment_payloads=False,
results=None,
batch_size=10,
since=None,
create_folders=True,
):
"""
@@ -1522,6 +1537,8 @@ def get_dmarc_reports_from_mailbox(
results (dict): Results from the previous run
batch_size (int): Number of messages to read and process before saving
(use 0 for no limit)
since: Search for messages since a certain time
(units - {"m":"minutes", "h":"hours", "d":"days", "w":"weeks"})
create_folders (bool): Whether to create the destination folders
(not used in watch)
@@ -1534,6 +1551,9 @@ def get_dmarc_reports_from_mailbox(
if connection is None:
raise ValueError("Must supply a connection")
# current_time useful to fetch_messages later in the program
current_time = None
aggregate_reports = []
forensic_reports = []
smtp_tls_reports = []
@@ -1557,11 +1577,50 @@ def get_dmarc_reports_from_mailbox(
connection.create_folder(smtp_tls_reports_folder)
connection.create_folder(invalid_reports_folder)
messages = connection.fetch_messages(reports_folder, batch_size=batch_size)
if since:
_since = 1440 # default one day
if re.match(r"\d+[mhd]$", since):
s = re.split(r"(\d+)", since)
if s[2] == "m":
_since = int(s[1])
elif s[2] == "h":
_since = int(s[1]) * 60
elif s[2] == "d":
_since = int(s[1]) * 60 * 24
elif s[2] == "w":
_since = int(s[1]) * 60 * 24 * 7
else:
logger.warning(
"Incorrect format for 'since' option. \
Provided value:{0}, Expected values:(5m|3h|2d|1w). \
Ignoring option, fetching messages for last 24hrs"
"SMTP does not support a time or timezone in since."
"See https://www.rfc-editor.org/rfc/rfc3501#page-52".format(since)
)
if isinstance(connection, IMAPConnection):
logger.debug(
"Only days and weeks values in 'since' option are \
considered for IMAP conections. Examples: 2d or 1w"
)
since = (datetime.utcnow() - timedelta(minutes=_since)).date()
current_time = datetime.utcnow().date()
elif isinstance(connection, MSGraphConnection):
since = (datetime.utcnow() - timedelta(minutes=_since)).isoformat() + "Z"
current_time = datetime.utcnow().isoformat() + "Z"
elif isinstance(connection, GmailConnection):
since = (datetime.utcnow() - timedelta(minutes=_since)).strftime("%s")
current_time = datetime.utcnow().strftime("%s")
else:
pass
messages = connection.fetch_messages(
reports_folder, batch_size=batch_size, since=since
)
total_messages = len(messages)
logger.debug("Found {0} messages in {1}".format(len(messages), reports_folder))
if batch_size:
if batch_size and not since:
message_limit = min(total_messages, batch_size)
else:
message_limit = total_messages
@@ -1575,7 +1634,13 @@ def get_dmarc_reports_from_mailbox(
i + 1, message_limit, msg_uid
)
)
msg_content = connection.fetch_message(msg_uid)
if isinstance(mailbox, MSGraphConnection):
if test:
msg_content = connection.fetch_message(msg_uid, mark_read=False)
else:
msg_content = connection.fetch_message(msg_uid, mark_read=True)
else:
msg_content = connection.fetch_message(msg_uid)
try:
sa = strip_attachment_payloads
parsed_email = parse_report_email(
@@ -1591,7 +1656,16 @@ def get_dmarc_reports_from_mailbox(
keep_alive=connection.keepalive,
)
if parsed_email["report_type"] == "aggregate":
aggregate_reports.append(parsed_email["report"])
report_org = parsed_email["report"]["report_metadata"]["org_name"]
report_id = parsed_email["report"]["report_metadata"]["report_id"]
report_key = f"{report_org}_{report_id}"
if report_key not in SEEN_AGGREGATE_REPORT_IDS:
SEEN_AGGREGATE_REPORT_IDS[report_key] = True
aggregate_reports.append(parsed_email["report"])
else:
logger.debug(
f"Skipping duplicate aggregate report with ID: {report_id}"
)
aggregate_report_msg_uids.append(msg_uid)
elif parsed_email["report_type"] == "forensic":
forensic_reports.append(parsed_email["report"])
@@ -1632,7 +1706,7 @@ def get_dmarc_reports_from_mailbox(
except Exception as e:
message = "Error deleting message UID"
e = "{0} {1}: " "{2}".format(message, msg_uid, e)
e = "{0} {1}: {2}".format(message, msg_uid, e)
logger.error("Mailbox error: {0}".format(e))
else:
if len(aggregate_report_msg_uids) > 0:
@@ -1706,7 +1780,12 @@ def get_dmarc_reports_from_mailbox(
]
)
total_messages = len(connection.fetch_messages(reports_folder))
if current_time:
total_messages = len(
connection.fetch_messages(reports_folder, since=current_time)
)
else:
total_messages = len(connection.fetch_messages(reports_folder))
if not test and not batch_size and total_messages > 0:
# Process emails that came in during the last run
@@ -1725,6 +1804,7 @@ def get_dmarc_reports_from_mailbox(
reverse_dns_map_path=reverse_dns_map_path,
reverse_dns_map_url=reverse_dns_map_url,
offline=offline,
since=current_time,
)
return results

View File

@@ -14,6 +14,7 @@ import json
from ssl import CERT_NONE, create_default_context
from multiprocessing import Pipe, Process
import sys
import http.client
from tqdm import tqdm
from parsedmarc import (
@@ -46,6 +47,9 @@ from parsedmarc.mail.graph import AuthMethod
from parsedmarc.log import logger
from parsedmarc.utils import is_mbox, get_reverse_dns
from parsedmarc import SEEN_AGGREGATE_REPORT_IDS
http.client._MAXHEADERS = 200 # pylint:disable=protected-access
formatter = logging.Formatter(
fmt="%(levelname)8s:%(filename)s:%(lineno)d:%(message)s",
@@ -395,7 +399,7 @@ def _main():
arg_parser.add_argument(
"-c",
"--config-file",
help="a path to a configuration file " "(--silent implied)",
help="a path to a configuration file (--silent implied)",
)
arg_parser.add_argument(
"file_path",
@@ -403,7 +407,7 @@ def _main():
help="one or more paths to aggregate or forensic "
"report files, emails, or mbox files'",
)
strip_attachment_help = "remove attachment payloads from forensic " "report output"
strip_attachment_help = "remove attachment payloads from forensic report output"
arg_parser.add_argument(
"--strip-attachment-payloads", help=strip_attachment_help, action="store_true"
)
@@ -446,14 +450,14 @@ def _main():
arg_parser.add_argument(
"-t",
"--dns_timeout",
help="number of seconds to wait for an answer " "from DNS (default: 2.0)",
help="number of seconds to wait for an answer from DNS (default: 2.0)",
type=float,
default=2.0,
)
arg_parser.add_argument(
"--offline",
action="store_true",
help="do not make online queries for geolocation " " or DNS",
help="do not make online queries for geolocation or DNS",
)
arg_parser.add_argument(
"-s", "--silent", action="store_true", help="only print errors"
@@ -510,6 +514,7 @@ def _main():
mailbox_test=False,
mailbox_batch_size=10,
mailbox_check_timeout=30,
mailbox_since=None,
imap_host=None,
imap_skip_certificate_verification=False,
imap_ssl=True,
@@ -526,6 +531,7 @@ def _main():
graph_tenant_id=None,
graph_mailbox=None,
graph_allow_unencrypted_storage=False,
graph_url="https://graph.microsoft.com",
hec=None,
hec_token=None,
hec_index=None,
@@ -714,6 +720,8 @@ def _main():
opts.mailbox_batch_size = mailbox_config.getint("batch_size")
if "check_timeout" in mailbox_config:
opts.mailbox_check_timeout = mailbox_config.getint("check_timeout")
if "since" in mailbox_config:
opts.mailbox_since = mailbox_config["since"]
if "imap" in config.sections():
imap_config = config["imap"]
@@ -726,7 +734,7 @@ def _main():
if "host" in imap_config:
opts.imap_host = imap_config["host"]
else:
logger.error("host setting missing from the " "imap config section")
logger.error("host setting missing from the imap config section")
exit(-1)
if "port" in imap_config:
opts.imap_port = imap_config.getint("port")
@@ -742,14 +750,12 @@ def _main():
if "user" in imap_config:
opts.imap_user = imap_config["user"]
else:
logger.critical("user setting missing from the " "imap config section")
logger.critical("user setting missing from the imap config section")
exit(-1)
if "password" in imap_config:
opts.imap_password = imap_config["password"]
else:
logger.critical(
"password setting missing from the " "imap config section"
)
logger.critical("password setting missing from the imap config section")
exit(-1)
if "reports_folder" in imap_config:
opts.mailbox_reports_folder = imap_config["reports_folder"]
@@ -818,21 +824,20 @@ def _main():
opts.graph_user = graph_config["user"]
else:
logger.critical(
"user setting missing from the " "msgraph config section"
"user setting missing from the msgraph config section"
)
exit(-1)
if "password" in graph_config:
opts.graph_password = graph_config["password"]
else:
logger.critical(
"password setting missing from the " "msgraph config section"
"password setting missing from the msgraph config section"
)
if "client_secret" in graph_config:
opts.graph_client_secret = graph_config["client_secret"]
else:
logger.critical(
"client_secret setting missing from the "
"msgraph config section"
"client_secret setting missing from the msgraph config section"
)
exit(-1)
@@ -845,7 +850,7 @@ def _main():
opts.graph_tenant_id = graph_config["tenant_id"]
else:
logger.critical(
"tenant_id setting missing from the " "msgraph config section"
"tenant_id setting missing from the msgraph config section"
)
exit(-1)
@@ -854,8 +859,7 @@ def _main():
opts.graph_client_secret = graph_config["client_secret"]
else:
logger.critical(
"client_secret setting missing from the "
"msgraph config section"
"client_secret setting missing from the msgraph config section"
)
exit(-1)
@@ -863,7 +867,7 @@ def _main():
opts.graph_client_id = graph_config["client_id"]
else:
logger.critical(
"client_id setting missing from the " "msgraph config section"
"client_id setting missing from the msgraph config section"
)
exit(-1)
@@ -871,10 +875,13 @@ def _main():
opts.graph_mailbox = graph_config["mailbox"]
elif opts.graph_auth_method != AuthMethod.UsernamePassword.name:
logger.critical(
"mailbox setting missing from the " "msgraph config section"
"mailbox setting missing from the msgraph config section"
)
exit(-1)
if "graph_url" in graph_config:
opts.graph_url = graph_config["graph_url"]
if "allow_unencrypted_storage" in graph_config:
opts.graph_allow_unencrypted_storage = graph_config.getboolean(
"allow_unencrypted_storage"
@@ -886,7 +893,7 @@ def _main():
opts.elasticsearch_hosts = _str_to_list(elasticsearch_config["hosts"])
else:
logger.critical(
"hosts setting missing from the " "elasticsearch config section"
"hosts setting missing from the elasticsearch config section"
)
exit(-1)
if "timeout" in elasticsearch_config:
@@ -924,7 +931,7 @@ def _main():
opts.opensearch_hosts = _str_to_list(opensearch_config["hosts"])
else:
logger.critical(
"hosts setting missing from the " "opensearch config section"
"hosts setting missing from the opensearch config section"
)
exit(-1)
if "timeout" in opensearch_config:
@@ -960,21 +967,21 @@ def _main():
opts.hec = hec_config["url"]
else:
logger.critical(
"url setting missing from the " "splunk_hec config section"
"url setting missing from the splunk_hec config section"
)
exit(-1)
if "token" in hec_config:
opts.hec_token = hec_config["token"]
else:
logger.critical(
"token setting missing from the " "splunk_hec config section"
"token setting missing from the splunk_hec config section"
)
exit(-1)
if "index" in hec_config:
opts.hec_index = hec_config["index"]
else:
logger.critical(
"index setting missing from the " "splunk_hec config section"
"index setting missing from the splunk_hec config section"
)
exit(-1)
if "skip_certificate_verification" in hec_config:
@@ -987,9 +994,7 @@ def _main():
if "hosts" in kafka_config:
opts.kafka_hosts = _str_to_list(kafka_config["hosts"])
else:
logger.critical(
"hosts setting missing from the " "kafka config section"
)
logger.critical("hosts setting missing from the kafka config section")
exit(-1)
if "user" in kafka_config:
opts.kafka_username = kafka_config["user"]
@@ -1004,21 +1009,20 @@ def _main():
opts.kafka_aggregate_topic = kafka_config["aggregate_topic"]
else:
logger.critical(
"aggregate_topic setting missing from the " "kafka config section"
"aggregate_topic setting missing from the kafka config section"
)
exit(-1)
if "forensic_topic" in kafka_config:
opts.kafka_forensic_topic = kafka_config["forensic_topic"]
else:
logger.critical(
"forensic_topic setting missing from the " "kafka config section"
"forensic_topic setting missing from the kafka config section"
)
if "smtp_tls_topic" in kafka_config:
opts.kafka_smtp_tls_topic = kafka_config["smtp_tls_topic"]
else:
logger.critical(
"forensic_topic setting missing from the "
"splunk_hec config section"
"smtp_tls_topic setting missing from the kafka config section"
)
if "smtp" in config.sections():
@@ -1026,7 +1030,7 @@ def _main():
if "host" in smtp_config:
opts.smtp_host = smtp_config["host"]
else:
logger.critical("host setting missing from the " "smtp config section")
logger.critical("host setting missing from the smtp config section")
exit(-1)
if "port" in smtp_config:
opts.smtp_port = smtp_config.getint("port")
@@ -1038,23 +1042,21 @@ def _main():
if "user" in smtp_config:
opts.smtp_user = smtp_config["user"]
else:
logger.critical("user setting missing from the " "smtp config section")
logger.critical("user setting missing from the smtp config section")
exit(-1)
if "password" in smtp_config:
opts.smtp_password = smtp_config["password"]
else:
logger.critical(
"password setting missing from the " "smtp config section"
)
logger.critical("password setting missing from the smtp config section")
exit(-1)
if "from" in smtp_config:
opts.smtp_from = smtp_config["from"]
else:
logger.critical("from setting missing from the " "smtp config section")
logger.critical("from setting missing from the smtp config section")
if "to" in smtp_config:
opts.smtp_to = _str_to_list(smtp_config["to"])
else:
logger.critical("to setting missing from the " "smtp config section")
logger.critical("to setting missing from the smtp config section")
if "subject" in smtp_config:
opts.smtp_subject = smtp_config["subject"]
if "attachment" in smtp_config:
@@ -1067,7 +1069,7 @@ def _main():
if "bucket" in s3_config:
opts.s3_bucket = s3_config["bucket"]
else:
logger.critical("bucket setting missing from the " "s3 config section")
logger.critical("bucket setting missing from the s3 config section")
exit(-1)
if "path" in s3_config:
opts.s3_path = s3_config["path"]
@@ -1092,9 +1094,7 @@ def _main():
if "server" in syslog_config:
opts.syslog_server = syslog_config["server"]
else:
logger.critical(
"server setting missing from the " "syslog config section"
)
logger.critical("server setting missing from the syslog config section")
exit(-1)
if "port" in syslog_config:
opts.syslog_port = syslog_config["port"]
@@ -1145,17 +1145,17 @@ def _main():
if "host" in gelf_config:
opts.gelf_host = gelf_config["host"]
else:
logger.critical("host setting missing from the " "gelf config section")
logger.critical("host setting missing from the gelf config section")
exit(-1)
if "port" in gelf_config:
opts.gelf_port = gelf_config["port"]
else:
logger.critical("port setting missing from the " "gelf config section")
logger.critical("port setting missing from the gelf config section")
exit(-1)
if "mode" in gelf_config:
opts.gelf_mode = gelf_config["mode"]
else:
logger.critical("mode setting missing from the " "gelf config section")
logger.critical("mode setting missing from the gelf config section")
exit(-1)
if "webhook" in config.sections():
@@ -1181,8 +1181,7 @@ def _main():
try:
fh = logging.FileHandler(opts.log_file, "a")
formatter = logging.Formatter(
"%(asctime)s - "
"%(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
"%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
)
fh.setFormatter(formatter)
logger.addHandler(fh)
@@ -1290,7 +1289,7 @@ def _main():
if opts.hec:
if opts.hec_token is None or opts.hec_index is None:
logger.error("HEC token and HEC index are required when " "using HEC URL")
logger.error("HEC token and HEC index are required when using HEC URL")
exit(1)
verify = True
@@ -1415,7 +1414,17 @@ def _main():
logger.error("Failed to parse {0} - {1}".format(result[1], result[0]))
else:
if result[0]["report_type"] == "aggregate":
aggregate_reports.append(result[0]["report"])
report_org = result[0]["report"]["report_metadata"]["org_name"]
report_id = result[0]["report"]["report_metadata"]["report_id"]
report_key = f"{report_org}_{report_id}"
if report_key not in SEEN_AGGREGATE_REPORT_IDS:
SEEN_AGGREGATE_REPORT_IDS[report_key] = True
aggregate_reports.append(result[0]["report"])
else:
logger.debug(
"Skipping duplicate aggregate report "
f"from {report_org} with ID: {report_id}"
)
elif result[0]["report_type"] == "forensic":
forensic_reports.append(result[0]["report"])
elif result[0]["report_type"] == "smtp_tls":
@@ -1443,7 +1452,7 @@ def _main():
try:
if opts.imap_user is None or opts.imap_password is None:
logger.error(
"IMAP user and password must be specified if" "host is specified"
"IMAP user and password must be specified if host is specified"
)
ssl = True
@@ -1482,6 +1491,7 @@ def _main():
password=opts.graph_password,
token_file=opts.graph_token_file,
allow_unencrypted_storage=opts.graph_allow_unencrypted_storage,
graph_url=opts.graph_url,
)
except Exception:
@@ -1540,6 +1550,7 @@ def _main():
nameservers=opts.nameservers,
test=opts.mailbox_test,
strip_attachment_payloads=opts.strip_attachment_payloads,
since=opts.mailbox_since,
)
aggregate_reports += reports["aggregate_reports"]

View File

@@ -552,8 +552,8 @@ def save_forensic_report_to_elasticsearch(
for original_header in original_headers:
headers[original_header.lower()] = original_headers[original_header]
arrival_date_human = forensic_report["arrival_date_utc"]
arrival_date = human_timestamp_to_datetime(arrival_date_human)
arrival_date = human_timestamp_to_datetime(forensic_report["arrival_date_utc"])
arrival_date_epoch_milliseconds = int(arrival_date.timestamp() * 1000)
if index_suffix is not None:
search_index = "dmarc_forensic_{0}*".format(index_suffix)
@@ -562,20 +562,35 @@ def save_forensic_report_to_elasticsearch(
if index_prefix is not None:
search_index = "{0}{1}".format(index_prefix, search_index)
search = Search(index=search_index)
arrival_query = {"match": {"arrival_date": arrival_date}}
q = Q(arrival_query)
q = Q(dict(match=dict(arrival_date=arrival_date_epoch_milliseconds)))
from_ = None
to_ = None
subject = None
if "from" in headers:
from_ = headers["from"]
from_query = {"match_phrase": {"sample.headers.from": from_}}
q = q & Q(from_query)
# We convert the FROM header from a string list to a flat string.
headers["from"] = headers["from"][0]
if headers["from"][0] == "":
headers["from"] = headers["from"][1]
else:
headers["from"] = " <".join(headers["from"]) + ">"
from_ = dict()
from_["sample.headers.from"] = headers["from"]
from_query = Q(dict(match_phrase=from_))
q = q & from_query
if "to" in headers:
to_ = headers["to"]
to_query = {"match_phrase": {"sample.headers.to": to_}}
q = q & Q(to_query)
# We convert the TO header from a string list to a flat string.
headers["to"] = headers["to"][0]
if headers["to"][0] == "":
headers["to"] = headers["to"][1]
else:
headers["to"] = " <".join(headers["to"]) + ">"
to_ = dict()
to_["sample.headers.to"] = headers["to"]
to_query = Q(dict(match_phrase=to_))
q = q & to_query
if "subject" in headers:
subject = headers["subject"]
subject_query = {"match_phrase": {"sample.headers.subject": subject}}
@@ -589,7 +604,9 @@ def save_forensic_report_to_elasticsearch(
"A forensic sample to {0} from {1} "
"with a subject of {2} and arrival date of {3} "
"already exists in "
"Elasticsearch".format(to_, from_, subject, arrival_date_human)
"Elasticsearch".format(
to_, from_, subject, forensic_report["arrival_date_utc"]
)
)
parsed_sample = forensic_report["parsed_sample"]
@@ -625,7 +642,7 @@ def save_forensic_report_to_elasticsearch(
user_agent=forensic_report["user_agent"],
version=forensic_report["version"],
original_mail_from=forensic_report["original_mail_from"],
arrival_date=arrival_date,
arrival_date=arrival_date_epoch_milliseconds,
domain=forensic_report["reported_domain"],
original_envelope_id=forensic_report["original_envelope_id"],
authentication_results=forensic_report["authentication_results"],

View File

@@ -63,24 +63,36 @@ class GmailConnection(MailboxConnection):
).execute()
except HttpError as e:
if e.status_code == 409:
logger.debug(
f"Folder {folder_name} already exists, " f"skipping creation"
)
logger.debug(f"Folder {folder_name} already exists, skipping creation")
else:
raise e
def _fetch_all_message_ids(self, reports_label_id, page_token=None):
results = (
self.service.users()
.messages()
.list(
userId="me",
includeSpamTrash=self.include_spam_trash,
labelIds=[reports_label_id],
pageToken=page_token,
def _fetch_all_message_ids(self, reports_label_id, page_token=None, since=None):
if since:
results = (
self.service.users()
.messages()
.list(
userId="me",
includeSpamTrash=self.include_spam_trash,
labelIds=[reports_label_id],
pageToken=page_token,
q=f"after:{since}",
)
.execute()
)
else:
results = (
self.service.users()
.messages()
.list(
userId="me",
includeSpamTrash=self.include_spam_trash,
labelIds=[reports_label_id],
pageToken=page_token,
)
.execute()
)
.execute()
)
messages = results.get("messages", [])
for message in messages:
yield message["id"]
@@ -92,7 +104,13 @@ class GmailConnection(MailboxConnection):
def fetch_messages(self, reports_folder: str, **kwargs) -> List[str]:
reports_label_id = self._find_label_id_for_label(reports_folder)
return [id for id in self._fetch_all_message_ids(reports_label_id)]
since = kwargs.get("since")
if since:
return [
id for id in self._fetch_all_message_ids(reports_label_id, since=since)
]
else:
return [id for id in self._fetch_all_message_ids(reports_label_id)]
def fetch_message(self, message_id):
msg = (

View File

@@ -89,6 +89,7 @@ class MSGraphConnection(MailboxConnection):
self,
auth_method: str,
mailbox: str,
graph_url: str,
client_id: str,
client_secret: str,
username: str,
@@ -108,7 +109,10 @@ class MSGraphConnection(MailboxConnection):
token_path=token_path,
allow_unencrypted_storage=allow_unencrypted_storage,
)
client_params = {"credential": credential}
client_params = {
"credential": credential,
"cloud": graph_url,
}
if not isinstance(credential, ClientSecretCredential):
scopes = ["Mail.ReadWrite"]
# Detect if mailbox is shared
@@ -137,25 +141,30 @@ class MSGraphConnection(MailboxConnection):
request_url = f"/users/{self.mailbox_name}/mailFolders{sub_url}"
resp = self._client.post(request_url, json=request_body)
if resp.status_code == 409:
logger.debug(f"Folder {folder_name} already exists, " f"skipping creation")
logger.debug(f"Folder {folder_name} already exists, skipping creation")
elif resp.status_code == 201:
logger.debug(f"Created folder {folder_name}")
else:
logger.warning(f"Unknown response " f"{resp.status_code} {resp.json()}")
logger.warning(f"Unknown response {resp.status_code} {resp.json()}")
def fetch_messages(self, folder_name: str, **kwargs) -> List[str]:
"""Returns a list of message UIDs in the specified folder"""
folder_id = self._find_folder_id_from_folder_path(folder_name)
url = f"/users/{self.mailbox_name}/mailFolders/" f"{folder_id}/messages"
url = f"/users/{self.mailbox_name}/mailFolders/{folder_id}/messages"
since = kwargs.get("since")
if not since:
since = None
batch_size = kwargs.get("batch_size")
if not batch_size:
batch_size = 0
emails = self._get_all_messages(url, batch_size)
emails = self._get_all_messages(url, batch_size, since)
return [email["id"] for email in emails]
def _get_all_messages(self, url, batch_size):
def _get_all_messages(self, url, batch_size, since):
messages: list
params = {"$select": "id"}
if since:
params["$filter"] = f"receivedDateTime ge {since}"
if batch_size and batch_size > 0:
params["$top"] = batch_size
else:
@@ -166,7 +175,7 @@ class MSGraphConnection(MailboxConnection):
messages = result.json()["value"]
# Loop if next page is present and not obtained message limit.
while "@odata.nextLink" in result.json() and (
batch_size == 0 or batch_size - len(messages) > 0
since is not None or (batch_size == 0 or batch_size - len(messages) > 0)
):
result = self._client.get(result.json()["@odata.nextLink"])
if result.status_code != 200:
@@ -180,17 +189,19 @@ class MSGraphConnection(MailboxConnection):
resp = self._client.patch(url, json={"isRead": "true"})
if resp.status_code != 200:
raise RuntimeWarning(
f"Failed to mark message read" f"{resp.status_code}: {resp.json()}"
f"Failed to mark message read{resp.status_code}: {resp.json()}"
)
def fetch_message(self, message_id: str):
def fetch_message(self, message_id: str, **kwargs):
url = f"/users/{self.mailbox_name}/messages/{message_id}/$value"
result = self._client.get(url)
if result.status_code != 200:
raise RuntimeWarning(
f"Failed to fetch message" f"{result.status_code}: {result.json()}"
f"Failed to fetch message{result.status_code}: {result.json()}"
)
self.mark_message_read(message_id)
mark_read = kwargs.get("mark_read")
if mark_read:
self.mark_message_read(message_id)
return result.text
def delete_message(self, message_id: str):
@@ -198,7 +209,7 @@ class MSGraphConnection(MailboxConnection):
resp = self._client.delete(url)
if resp.status_code != 204:
raise RuntimeWarning(
f"Failed to delete message " f"{resp.status_code}: {resp.json()}"
f"Failed to delete message {resp.status_code}: {resp.json()}"
)
def move_message(self, message_id: str, folder_name: str):
@@ -208,7 +219,7 @@ class MSGraphConnection(MailboxConnection):
resp = self._client.post(url, json=request_body)
if resp.status_code != 201:
raise RuntimeWarning(
f"Failed to move message " f"{resp.status_code}: {resp.json()}"
f"Failed to move message {resp.status_code}: {resp.json()}"
)
def keepalive(self):
@@ -243,7 +254,7 @@ class MSGraphConnection(MailboxConnection):
filter = f"?$filter=displayName eq '{folder_name}'"
folders_resp = self._client.get(url + filter)
if folders_resp.status_code != 200:
raise RuntimeWarning(f"Failed to list folders." f"{folders_resp.json()}")
raise RuntimeWarning(f"Failed to list folders.{folders_resp.json()}")
folders: list = folders_resp.json()["value"]
matched_folders = [
folder for folder in folders if folder["displayName"] == folder_name

View File

@@ -39,7 +39,11 @@ class IMAPConnection(MailboxConnection):
def fetch_messages(self, reports_folder: str, **kwargs):
self._client.select_folder(reports_folder)
return self._client.search()
since = kwargs.get("since")
if since:
return self._client.search(["SINCE", since])
else:
return self._client.search()
def fetch_message(self, message_id):
return self._client.fetch_message(message_id, parse=False)
@@ -81,7 +85,5 @@ class IMAPConnection(MailboxConnection):
logger.warning("IMAP connection timeout. Reconnecting...")
sleep(check_timeout)
except Exception as e:
logger.warning(
"IMAP connection error. {0}. " "Reconnecting...".format(e)
)
logger.warning("IMAP connection error. {0}. Reconnecting...".format(e))
sleep(check_timeout)

View File

@@ -202,13 +202,15 @@ class _SMTPTLSPolicyDoc(InnerDoc):
receiving_ip,
receiving_mx_helo,
failed_session_count,
sending_mta_ip=None,
receiving_mx_hostname=None,
additional_information_uri=None,
failure_reason_code=None,
):
self.failure_details.append(
_details = _SMTPTLSFailureDetailsDoc(
result_type=result_type,
ip_address=ip_address,
sending_mta_ip=sending_mta_ip,
receiving_mx_hostname=receiving_mx_hostname,
receiving_mx_helo=receiving_mx_helo,
receiving_ip=receiving_ip,
@@ -216,9 +218,10 @@ class _SMTPTLSPolicyDoc(InnerDoc):
additional_information=additional_information_uri,
failure_reason_code=failure_reason_code,
)
self.failure_details.append(_details)
class _SMTPTLSFailureReportDoc(Document):
class _SMTPTLSReportDoc(Document):
class Index:
name = "smtp_tls"
@@ -499,6 +502,7 @@ def save_aggregate_report_to_opensearch(
index = "{0}_{1}".format(index, index_suffix)
if index_prefix:
index = "{0}{1}".format(index_prefix, index)
index = "{0}-{1}".format(index, index_date)
index_settings = dict(
number_of_shards=number_of_shards, number_of_replicas=number_of_replicas
@@ -548,8 +552,8 @@ def save_forensic_report_to_opensearch(
for original_header in original_headers:
headers[original_header.lower()] = original_headers[original_header]
arrival_date_human = forensic_report["arrival_date_utc"]
arrival_date = human_timestamp_to_datetime(arrival_date_human)
arrival_date = human_timestamp_to_datetime(forensic_report["arrival_date_utc"])
arrival_date_epoch_milliseconds = int(arrival_date.timestamp() * 1000)
if index_suffix is not None:
search_index = "dmarc_forensic_{0}*".format(index_suffix)
@@ -558,20 +562,35 @@ def save_forensic_report_to_opensearch(
if index_prefix is not None:
search_index = "{0}{1}".format(index_prefix, search_index)
search = Search(index=search_index)
arrival_query = {"match": {"arrival_date": arrival_date}}
q = Q(arrival_query)
q = Q(dict(match=dict(arrival_date=arrival_date_epoch_milliseconds)))
from_ = None
to_ = None
subject = None
if "from" in headers:
from_ = headers["from"]
from_query = {"match_phrase": {"sample.headers.from": from_}}
q = q & Q(from_query)
# We convert the FROM header from a string list to a flat string.
headers["from"] = headers["from"][0]
if headers["from"][0] == "":
headers["from"] = headers["from"][1]
else:
headers["from"] = " <".join(headers["from"]) + ">"
from_ = dict()
from_["sample.headers.from"] = headers["from"]
from_query = Q(dict(match_phrase=from_))
q = q & from_query
if "to" in headers:
to_ = headers["to"]
to_query = {"match_phrase": {"sample.headers.to": to_}}
q = q & Q(to_query)
# We convert the TO header from a string list to a flat string.
headers["to"] = headers["to"][0]
if headers["to"][0] == "":
headers["to"] = headers["to"][1]
else:
headers["to"] = " <".join(headers["to"]) + ">"
to_ = dict()
to_["sample.headers.to"] = headers["to"]
to_query = Q(dict(match_phrase=to_))
q = q & to_query
if "subject" in headers:
subject = headers["subject"]
subject_query = {"match_phrase": {"sample.headers.subject": subject}}
@@ -585,7 +604,9 @@ def save_forensic_report_to_opensearch(
"A forensic sample to {0} from {1} "
"with a subject of {2} and arrival date of {3} "
"already exists in "
"OpenSearch".format(to_, from_, subject, arrival_date_human)
"OpenSearch".format(
to_, from_, subject, forensic_report["arrival_date_utc"]
)
)
parsed_sample = forensic_report["parsed_sample"]
@@ -621,7 +642,7 @@ def save_forensic_report_to_opensearch(
user_agent=forensic_report["user_agent"],
version=forensic_report["version"],
original_mail_from=forensic_report["original_mail_from"],
arrival_date=arrival_date,
arrival_date=arrival_date_epoch_milliseconds,
domain=forensic_report["reported_domain"],
original_envelope_id=forensic_report["original_envelope_id"],
authentication_results=forensic_report["authentication_results"],
@@ -685,7 +706,7 @@ def save_smtp_tls_report_to_opensearch(
AlreadySaved
"""
logger.info("Saving aggregate report to OpenSearch")
org_name = report["org_name"]
org_name = report["organization_name"]
report_id = report["report_id"]
begin_date = human_timestamp_to_datetime(report["begin_date"], to_utc=True)
end_date = human_timestamp_to_datetime(report["end_date"], to_utc=True)
@@ -741,11 +762,11 @@ def save_smtp_tls_report_to_opensearch(
number_of_shards=number_of_shards, number_of_replicas=number_of_replicas
)
smtp_tls_doc = _SMTPTLSFailureReportDoc(
organization_name=report["organization_name"],
date_range=[report["date_begin"], report["date_end"]],
date_begin=report["date_begin"],
date_end=report["date_end"],
smtp_tls_doc = _SMTPTLSReportDoc(
org_name=report["organization_name"],
date_range=[report["begin_date"], report["end_date"]],
date_begin=report["begin_date"],
date_end=report["end_date"],
contact_info=report["contact_info"],
report_id=report["report_id"],
)
@@ -760,32 +781,48 @@ def save_smtp_tls_report_to_opensearch(
policy_doc = _SMTPTLSPolicyDoc(
policy_domain=policy["policy_domain"],
policy_type=policy["policy_type"],
succesful_session_count=policy["successful_session_count"],
failed_session_count=policy["failed_session_count"],
policy_string=policy_strings,
mx_host_patterns=mx_host_patterns,
)
if "failure_details" in policy:
failure_details = policy["failure_details"]
receiving_mx_hostname = None
additional_information_uri = None
failure_reason_code = None
if "receiving_mx_hostname" in failure_details:
receiving_mx_hostname = failure_details["receiving_mx_hostname"]
if "additional_information_uri" in failure_details:
additional_information_uri = failure_details[
"additional_information_uri"
]
if "failure_reason_code" in failure_details:
failure_reason_code = failure_details["failure_reason_code"]
policy_doc.add_failure_details(
result_type=failure_details["result_type"],
ip_address=failure_details["ip_address"],
receiving_ip=failure_details["receiving_ip"],
receiving_mx_helo=failure_details["receiving_mx_helo"],
failed_session_count=failure_details["failed_session_count"],
receiving_mx_hostname=receiving_mx_hostname,
additional_information_uri=additional_information_uri,
failure_reason_code=failure_reason_code,
)
for failure_detail in policy["failure_details"]:
receiving_mx_hostname = None
additional_information_uri = None
failure_reason_code = None
ip_address = None
receiving_ip = None
receiving_mx_helo = None
sending_mta_ip = None
if "receiving_mx_hostname" in failure_detail:
receiving_mx_hostname = failure_detail["receiving_mx_hostname"]
if "additional_information_uri" in failure_detail:
additional_information_uri = failure_detail[
"additional_information_uri"
]
if "failure_reason_code" in failure_detail:
failure_reason_code = failure_detail["failure_reason_code"]
if "ip_address" in failure_detail:
ip_address = failure_detail["ip_address"]
if "receiving_ip" in failure_detail:
receiving_ip = failure_detail["receiving_ip"]
if "receiving_mx_helo" in failure_detail:
receiving_mx_helo = failure_detail["receiving_mx_helo"]
if "sending_mta_ip" in failure_detail:
sending_mta_ip = failure_detail["sending_mta_ip"]
policy_doc.add_failure_details(
result_type=failure_detail["result_type"],
ip_address=ip_address,
receiving_ip=receiving_ip,
receiving_mx_helo=receiving_mx_helo,
failed_session_count=failure_detail["failed_session_count"],
sending_mta_ip=sending_mta_ip,
receiving_mx_hostname=receiving_mx_hostname,
additional_information_uri=additional_information_uri,
failure_reason_code=failure_reason_code,
)
smtp_tls_doc.policies.append(policy_doc)
create_indexes([index], index_settings)

View File

@@ -1,7 +1,7 @@
# About
`dbip-country-lite.mmdb` is provided by [dbip][dbip] under a
[ Creative Commons Attribution 4.0 International License][cc].
[Creative Commons Attribution 4.0 International License][cc].
[dbip]: https://db-ip.com/db/lite.php
[dbip]: https://db-ip.com/db/download/ip-to-country-lite
[cc]: http://creativecommons.org/licenses/by/4.0/

View File

@@ -19,33 +19,65 @@ The `service_type` is based on the following rule precedence:
3. All telecommunications providers that offer internet access are identified as `ISP`, even if they also offer other services, such as web hosting or email hosting.
4. All web hosting providers are identified as `Web Hosting`, even if the service also offers email hosting.
5. All email account providers are identified as `Email Provider`, no matter how or where they are hosted
6. All legitimate platforms offering their Software as a Service SaaS) are identified as `SaaS`, regardless of industry. This helps simplify metrics.
6. All legitimate platforms offering their Software as a Service (SaaS) are identified as `SaaS`, regardless of industry. This helps simplify metrics.
7. All other senders that use their own domain as a Reverse DNS base domain should be identified based on their industry
- Agriculture
- Automotive
- Beauty
- Construction
- Consulting
- Defense
- Education
- Email Provider
- Email Security
- Education
- Entertainment
- Event Planning
- Finance
- Food
- Government
- Government Media
- Healthcare
- IaaS
- Industrial
- ISP
- Logistics
- Manufacturing
- Marketing
- MSP
- MSSP
- News
- Nonprofit
- PaaS
- Photography
- Print
- Publishing
- Real Estate
- Retail
- SaaS
- Science
- Search Engine
- Social Media
- Sports
- Staffing
- Technology
- Travel
- Web Host
The file currently contains over 600 mappings from a wide variety of email sending services, including large email
providers, SaaS platforms, small web hosts, and healthcare companies. Ideally this mapping will continuously grow to
include many other services and industries.
The file currently contains over 1,400 mappings from a wide variety of email sending sources.
## known_unknown_base_reverse_dns.txt
A list of reverse DNS base domains that could not be identified as belonging to a particular organization, service, or industry.
## base_reverse_dns.csv
A CSV with the fields `source_name` and optionally `message_count`. This CSV can be generated by exporting the base DNS data from the Kibana or Splunk dashboards provided by parsedmarc. This file is not tracked by Git.
## unknown_base_reverse_dns.csv
A CSV file with the fields `source_name` and `message_count`. This file is not tracked by Git.
## find_unknown_base_reverse_dns.py
This is a python script that reads the domains in `base_reverse_dns.csv` and writes the domains that are not in `base_reverse_dns_map.csv` or `known_unknown_base_reverse_dns.txt` to `unknown_base_reverse_dns.csv`. This is useful for identifying potential additional domains to contribute to `base_reverse_dns_map.csv` and `known_unknown_base_reverse_dns.txt`.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,73 @@
#!/usr/bin/env python
import logging
import os
import csv
def _main():
    """Find reverse-DNS base domains that are not yet classified.

    Reads ``base_reverse_dns.csv`` (exported dashboard data) and writes any
    domain that appears in neither ``base_reverse_dns_map.csv`` nor
    ``known_unknown_base_reverse_dns.txt`` to
    ``unknown_base_reverse_dns.csv``. Also warns about duplicate entries
    and domains listed in both input files.

    Exits with status 1 if any required input file is missing.
    """
    input_csv_file_path = "base_reverse_dns.csv"
    base_reverse_dns_map_file_path = "base_reverse_dns_map.csv"
    known_unknown_list_file_path = "known_unknown_base_reverse_dns.txt"
    output_csv_file_path = "unknown_base_reverse_dns.csv"
    csv_headers = ["source_name", "message_count"]
    output_rows = []

    logging.basicConfig()
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    # All input files must exist before any work starts.
    for p in [
        input_csv_file_path,
        base_reverse_dns_map_file_path,
        known_unknown_list_file_path,
    ]:
        if not os.path.exists(p):
            logger.error(f"{p} does not exist")
            exit(1)

    logger.info(f"Loading {known_unknown_list_file_path}")
    known_unknown_domains = []
    with open(known_unknown_list_file_path) as f:
        for line in f.readlines():
            domain = line.lower().strip()
            if domain in known_unknown_domains:
                logger.warning(
                    f"{domain} is in {known_unknown_list_file_path} multiple times"
                )
            else:
                known_unknown_domains.append(domain)

    logger.info(f"Loading {base_reverse_dns_map_file_path}")
    known_domains = []
    with open(base_reverse_dns_map_file_path) as f:
        for row in csv.DictReader(f):
            domain = row["base_reverse_dns"].lower().strip()
            if domain in known_domains:
                logger.warning(
                    f"{domain} is in {base_reverse_dns_map_file_path} multiple times"
                )
            else:
                known_domains.append(domain)
            # Fixed: the original condition was
            # `domain in known_unknown_domains and known_domains`, which
            # tested the truthiness of the known_domains list rather than
            # membership, and was preceded by a stray dead `pass`. A domain
            # present in both input files is a data inconsistency worth a
            # warning.
            if domain in known_unknown_domains:
                logger.warning(
                    f"{domain} is in {known_unknown_list_file_path} "
                    f"and {base_reverse_dns_map_file_path}"
                )

    logger.info(f"Checking domains against {base_reverse_dns_map_file_path}")
    with open(input_csv_file_path) as f:
        for row in csv.DictReader(f):
            domain = row["source_name"].lower().strip()
            # Keep only domains that are completely unclassified.
            if domain not in known_domains and domain not in known_unknown_domains:
                logger.info(f"New unknown domain found: {domain}")
                output_rows.append(row)

    logger.info(f"Writing {output_csv_file_path}")
    with open(output_csv_file_path, "w") as f:
        writer = csv.DictWriter(f, fieldnames=csv_headers)
        writer.writeheader()
        writer.writerows(output_rows)


if __name__ == "__main__":
    _main()

View File

@@ -0,0 +1,125 @@
200.in-addr.arpa
adlucrumnewsletter.com
admin.corpivensa.gob.ve
aerospacevitro.us.com
albagroup-eg.com
anteldata.net.uy
antonaoll.com
aosau.net
arandomserver.com
asmecam.it
b8sales.com
bestinvestingtime.com
biocorp.com
bisno1.co.jp
bluhosting.com
bodiax.pp.ua
bost-law.com
brnonet.cz
brushinglegal.de
christus.mx
cloud-edm.com
cloudlogin.co
cnode.io
commerceinsurance.com
coolblaze.com
cps.com.ar
detrot.xyz
digi.net.my
dkginternet.com
doorsrv.com
dreamtechmedia.com
ds.network
emailperegrine.com
epsilon-group.com
eyecandyhosting.xyz
fetscorp.shop
formicidaehunt.net
fosterheap.com
gendns.com
ginous.eu.com
gist-th.com
gophermedia.com
gqlists.us.com
gratzl.de
hgnbroken.us.com
hosting1337.com
hostingmichigan.com
hostname.localhost
hostnetwork.com
hostwhitelabel.com
idcfcloud.net
immenzaces.com
ivol.co
jalanet.co.id
kahlaa.com
kbronet.com.tw
kdnursing.org
kitchenaildbd.com
legenditds.com
lighthouse-media.com
lohkal.com
lonestarmm.net
magnetmail.net
manhattanbulletpoint.com
masterclassjournal.com
moderntradingnews.com
moonjaws.com
motion4ever.net
mschosting.com
mspnet.pro
mts-nn.ru
mxthunder.net
myrewards.net
mysagestore.com
ncport.ru
nebdig.com
neovet-base.ru
nic.name
nidix.net
ogicom.net
omegabrasil.inf.br
onnet21.com
ovaltinalization.co
overta.ru
passionatesmiles.com
planethoster.net
pmnhost.net
popiup.com
prima.com.ar
prima.net.ar
proudserver.com
qontenciplc.autos
raxa.host
sahacker-2020.com
samsales.site
satirogluet.com
securednshost.com
securen.net
securerelay.in
securev.net
servershost.biz
smallvillages.com
solusoftware.com
spiritualtechnologies.io
sprout.org
stableserver.net
stockexchangejournal.com
suksangroup.com
system.eu.com
t-jon.com
tenkids.net
thaicloudsolutions.com
thaimonster.com
tullostrucking.com
unite.services
urawasl.com
us.servername.us
vendimetry.com
vibrantwellnesscorp.com
wallstreetsgossip.com
weblinkinternational.com
xsfati.us.com
xspmail.jp
zerowebhosting.net
znlc.jp

View File

@@ -19,10 +19,11 @@ import csv
import io
try:
import importlib.resources as pkg_resources
from importlib.resources import files
except ImportError:
# Try backported to PY<37 `importlib_resources`
import importlib_resources as pkg_resources
# Try backported to PY<3 `importlib_resources`
from importlib.resources import files
from dateutil.parser import parse as parse_date
import dns.reversename
@@ -280,14 +281,13 @@ def get_ip_address_country(ip_address, db_path=None):
break
if db_path is None:
with pkg_resources.path(
parsedmarc.resources.dbip, "dbip-country-lite.mmdb"
) as path:
db_path = path
db_path = str(
files(parsedmarc.resources.dbip).joinpath("dbip-country-lite.mmdb")
)
db_age = datetime.now() - datetime.fromtimestamp(os.stat(db_path).st_mtime)
if db_age > timedelta(days=30):
logger.warning("IP database is more than a month old")
db_age = datetime.now() - datetime.fromtimestamp(os.stat(db_path).st_mtime)
if db_age > timedelta(days=30):
logger.warning("IP database is more than a month old")
db_reader = geoip2.database.Reader(db_path)
@@ -344,21 +344,28 @@ def get_service_from_reverse_dns_base_domain(
if not (offline or always_use_local_file) and len(reverse_dns_map) == 0:
try:
logger.debug(f"Trying to fetch " f"reverse DNS map from {url}...")
csv_file.write(requests.get(url).text)
logger.debug(f"Trying to fetch reverse DNS map from {url}...")
response = requests.get(url)
response.raise_for_status()
csv_file.write(response.text)
csv_file.seek(0)
load_csv(csv_file)
except requests.exceptions.RequestException as e:
logger.warning(f"Failed to fetch reverse DNS map: {e}")
except Exception:
logger.warning("Not a valid CSV file")
csv_file.seek(0)
logger.debug(csv_file.read())
if len(reverse_dns_map) == 0:
logger.info("Loading included reverse DNS map...")
with pkg_resources.path(
parsedmarc.resources.maps, "base_reverse_dns_map.csv"
) as path:
if local_file_path is not None:
path = local_file_path
with open(path) as csv_file:
load_csv(csv_file)
path = str(
files(parsedmarc.resources.maps).joinpath("base_reverse_dns_map.csv")
)
if local_file_path is not None:
path = local_file_path
with open(path) as csv_file:
load_csv(csv_file)
try:
service = reverse_dns_map[base_domain]
except KeyError:

View File

@@ -1,6 +1,6 @@
[build-system]
requires = [
"hatchling>=1.8.1",
"hatchling>=1.27.0",
]
build-backend = "hatchling.build"
@@ -59,7 +59,7 @@ dependencies = [
[project.optional-dependencies]
build = [
"hatch",
"hatch>=1.14.0",
"myst-parser[linkify]",
"nose",
"pytest",

25
sortmaps.py Executable file
View File

@@ -0,0 +1,25 @@
#!/usr/bin/env python3
"""Sort every CSV file under parsedmarc/resources/maps in place by its
first column, keeping the header row first."""
import os
import glob
import csv

maps_dir = os.path.join("parsedmarc", "resources", "maps")
csv_files = glob.glob(os.path.join(maps_dir, "*.csv"))


def sort_csv(filepath, column=0):
    """Sort the rows of the CSV file at *filepath* in place.

    The header row stays first; the remaining rows are sorted
    lexicographically by the value in *column* (default: the first
    column). The file is rewritten with LF newline translation.
    """
    with open(filepath, mode="r", newline="") as infile:
        reader = csv.reader(infile)
        header = next(reader)
        sorted_rows = sorted(reader, key=lambda row: row[column])

    with open(filepath, mode="w", newline="\n") as outfile:
        writer = csv.writer(outfile)
        writer.writerow(header)
        writer.writerows(sorted_rows)


if __name__ == "__main__":
    # Fixed: the original ran this loop at import time; guarding it makes
    # the module safe to import (e.g. for testing) without side effects.
    for csv_file in csv_files:
        sort_csv(csv_file)