From 46e694502defebc3bb3eb12a9fdef15f0c962890 Mon Sep 17 00:00:00 2001
From: Sean Whalen <44679+seanthegeek@users.noreply.github.com>
Date: Thu, 4 Jun 2026 10:36:49 -0400
Subject: [PATCH] Detect aggregate reports by "xml_schema" instead of "domain"

xml_schema appears only in aggregate rows (failure and SMTP TLS rows don't
carry it) and is a distinctive, non-generic field name, which addresses the
concern that a generic "domain" field could also appear in other logs.
parsedmarc defaults xml_schema to "draft" when the report omits <version>
(parsedmarc/__init__.py:832), so the field is still set when the version
element is missing -- unlike a field with no default.

It is also a native JSON string straight out of the json{} filter, so unlike
dmarc_aligned it needs no convert step to be testable, keeping detection
independent of the type-conversion in step 1b. xml_schema is added to the
pre-json init block (required for any if-tested field); domain stays
initialized since it is still mapped to target.hostname.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 google_secops_parser/README.md       |  2 +-
 google_secops_parser/parsedmarc.conf | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/google_secops_parser/README.md b/google_secops_parser/README.md
index ba403f7..8ab4df6 100644
--- a/google_secops_parser/README.md
+++ b/google_secops_parser/README.md
@@ -27,7 +27,7 @@ detects them by a field unique to each and maps them as follows:
 
 | parsedmarc report | Detected by | UDM `metadata.event_type` |
 |---|---|---|
-| DMARC aggregate | `domain` | `EMAIL_TRANSACTION` |
+| DMARC aggregate | `xml_schema` | `EMAIL_TRANSACTION` |
 | DMARC failure | `feedback_type` | `EMAIL_TRANSACTION` |
 | SMTP TLS (RFC 8460) | `policy_type` | `GENERIC_EVENT` |
 
diff --git a/google_secops_parser/parsedmarc.conf b/google_secops_parser/parsedmarc.conf
index a3f4b37..d4dee5a 100644
--- a/google_secops_parser/parsedmarc.conf
+++ b/google_secops_parser/parsedmarc.conf
@@ -10,7 +10,7 @@ filter {
   #
   # parsedmarc emits three flat JSON shapes, one object per syslog line, via the
   # CSV-row serializers (parsed_aggregate/failure/smtp_tls_reports_to_csv_rows):
-  #   * DMARC aggregate report record  -> detected by "domain"
+  #   * DMARC aggregate report record  -> detected by "xml_schema"
   #   * DMARC failure  report record   -> detected by "feedback_type"
   #   * SMTP TLS        report record   -> detected by "policy_type"
   #
@@ -81,9 +81,10 @@ filter {
       # report-type detection
       "feedback_type" => ""
       "policy_type"   => ""
-      "domain"        => ""
+      "xml_schema"    => ""
 
       # aggregate
+      "domain" => ""
       "report_id" => ""
       "org_name" => ""
       "org_email" => ""
@@ -207,16 +208,19 @@ filter {
   #    and unique to each shape:
   #      feedback_type -> failure
   #      policy_type   -> smtp_tls
-  #      domain        -> aggregate  (the reported From-domain; a required
-  #                       element of every aggregate record)
-  #    domain is preferred over header_from (which can be empty when a record
-  #    carries no identifiers) and over adkim (a defaulted policy field).
+  #      xml_schema    -> aggregate
+  #    xml_schema is aggregate-only, and parsedmarc defaults it to "draft" when
+  #    the report omits <version> (parsedmarc/__init__.py), so it is still set
+  #    without <version>. It is preferred over: header_from (can be empty when a
+  #    record carries no identifiers), adkim (a defaulted policy field), domain
+  #    (a generic name), and dmarc_aligned (a boolean that only becomes testable
+  #    after the convert in step 1b -- detection should not depend on that).
   # ---------------------------------------------------------------------------
   if [feedback_type] {
     mutate { replace => { "report_type" => "failure" } }
   } else if [policy_type] {
     mutate { replace => { "report_type" => "smtp_tls" } }
-  } else if [domain] {
+  } else if [xml_schema] {
     mutate { replace => { "report_type" => "aggregate" } }
   }