From ae5d20ecf5fa3cbeacf63d115cf9ce1af34f3a5c Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Fri, 6 Mar 2026 13:21:54 -0500
Subject: [PATCH] Fix duplicate detection for normalized aggregate reports in
 Elasticsearch/OpenSearch (#666)

Change date_begin/date_end queries from exact match to range queries (gte/lte)
so that previously saved normalized time buckets are correctly detected as
duplicates within the original report's date range.

Co-authored-by: seanthegeek <44679+seanthegeek@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: seanthegeek <44679+seanthegeek@users.noreply.github.com>
---
 parsedmarc/elastic.py    | 4 ++--
 parsedmarc/opensearch.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/parsedmarc/elastic.py b/parsedmarc/elastic.py
index b109d1f..6193548 100644
--- a/parsedmarc/elastic.py
+++ b/parsedmarc/elastic.py
@@ -413,8 +413,8 @@ def save_aggregate_report_to_elasticsearch(
     org_name_query = Q(dict(match_phrase=dict(org_name=org_name))) # type: ignore
     report_id_query = Q(dict(match_phrase=dict(report_id=report_id))) # pyright: ignore[reportArgumentType]
     domain_query = Q(dict(match_phrase={"published_policy.domain": domain})) # pyright: ignore[reportArgumentType]
-    begin_date_query = Q(dict(match=dict(date_begin=begin_date))) # pyright: ignore[reportArgumentType]
-    end_date_query = Q(dict(match=dict(date_end=end_date))) # pyright: ignore[reportArgumentType]
+    begin_date_query = Q(dict(range=dict(date_begin=dict(gte=begin_date)))) # pyright: ignore[reportArgumentType]
+    end_date_query = Q(dict(range=dict(date_end=dict(lte=end_date)))) # pyright: ignore[reportArgumentType]
 
     if index_suffix is not None:
         search_index = "dmarc_aggregate_{0}*".format(index_suffix)
diff --git a/parsedmarc/opensearch.py b/parsedmarc/opensearch.py
index ca0ffe3..c817515 100644
--- a/parsedmarc/opensearch.py
+++ b/parsedmarc/opensearch.py
@@ -413,8 +413,8 @@ def save_aggregate_report_to_opensearch(
     org_name_query = Q(dict(match_phrase=dict(org_name=org_name)))
     report_id_query = Q(dict(match_phrase=dict(report_id=report_id)))
     domain_query = Q(dict(match_phrase={"published_policy.domain": domain}))
-    begin_date_query = Q(dict(match=dict(date_begin=begin_date)))
-    end_date_query = Q(dict(match=dict(date_end=end_date)))
+    begin_date_query = Q(dict(range=dict(date_begin=dict(gte=begin_date))))
+    end_date_query = Q(dict(range=dict(date_end=dict(lte=end_date))))
 
     if index_suffix is not None:
         search_index = "dmarc_aggregate_{0}*".format(index_suffix)