From f91588a124fcf6873ab66f00c2c0a197c25bea46 Mon Sep 17 00:00:00 2001 From: The n6 Development Team Date: Sun, 28 Nov 2021 22:48:41 +0100 Subject: [PATCH] Version 3.0.0-beta2 --- .gitignore | 1 + .n6-version | 2 +- CHANGELOG.md | 9 +- N6AdminPanel/n6adminpanel/admin_panel.conf | 4 +- .../n6adminpanel/mail_notices_helpers.py | 43 +- .../n6adminpanel/org_request_helpers.py | 16 +- N6Core/n6/base/config.py | 5 +- N6Core/n6/collectors/generic.py | 14 +- N6Core/n6/data/conf/70_abuse_ch.conf | 3 + N6Core/n6/parsers/generic.py | 17 + N6Core/n6/tests/utils/test_aggregator.py | 25 +- N6Core/n6/tests/utils/test_enrich.py | 14 +- N6Core/n6/utils/aggregator.py | 23 +- N6Core/n6/utils/enrich.py | 10 +- N6Core/n6/utils/recorder_conf_generator.py | 384 ---- N6Core/setup.py | 8 +- .../n6corelib/pki_related_test_helpers.py | 2 +- N6CoreLib/setup.py | 7 +- N6DataPipeline/console_scripts | 4 +- N6DataPipeline/n6datapipeline/aggregator.py | 400 ++++ N6DataPipeline/n6datapipeline/base.py | 47 +- .../n6datapipeline/data/conf/00_global.conf | 16 + .../n6datapipeline/data/conf/09_auth_db.conf | 35 + .../data/conf/11_jinja_rendering.conf | 59 + .../n6datapipeline/data/conf/11_mailing.conf | 242 +++ .../n6datapipeline/data/templates/.gitkeep | 0 N6DataPipeline/n6datapipeline/enrich.py | 298 +++ .../n6datapipeline/intelmq/helpers.py | 6 +- .../intelmq/utils/intelmq_adapter.py | 7 +- .../intelmq/utils/intelmq_converter.py | 2 +- .../n6datapipeline/tests/test_aggregator.py | 1866 +++++++++++++++++ .../n6datapipeline/tests/test_enrich.py | 1377 ++++++++++++ N6DataPipeline/setup.py | 2 +- N6DataSources/console_scripts | 7 +- .../n6datasources/collectors/__init__.py | 0 .../n6datasources/collectors/abuse_ch.py | 77 + .../n6datasources/collectors/base.py | 1153 ++++++++++ .../data/conf/01_global_parsers.conf | 3 + .../n6datasources/data/conf/70_abuse_ch.conf | 12 + .../n6datasources/parsers/__init__.py | 0 .../n6datasources/parsers/abuse_ch.py | 45 + N6DataSources/n6datasources/parsers/base.py | 904 ++++++++ .../tests/collectors/__init__.py | 0 .../n6datasources/tests/parsers/__init__.py | 0 N6Lib-py2/n6lib/auth_db/scripts.py | 2 +- N6Lib-py2/n6lib/csv_helpers.py | 45 +- N6Lib-py2/setup.py | 6 +- N6Lib/n6lib/auth_db/api.py | 94 +- N6Lib/n6lib/class_helpers.py | 104 +- N6Lib/n6lib/common_helpers.py | 21 +- N6Lib/n6lib/csv_helpers.py | 60 +- .../pyramid_commons/_generic_view_mixins.py | 12 +- .../n6lib/pyramid_commons/_pyramid_commons.py | 13 +- N6Portal/n6portal/__init__.py | 1 + .../n6sdk/_api_test_tool/api_test_tool.py | 2 +- N6SDK-py2/setup.py | 4 +- do_setup.py | 1 - docker-compose.yml | 4 +- docker/base/Dockerfile | 10 +- docker/web/Dockerfile | 2 +- docker/worker/Dockerfile | 30 +- docs/data_flow.png | Bin 0 -> 63916 bytes docs/data_flow_overview.md | 5 + docs/docker.md | 5 + docs/guides/intelmq/config.md | 2 +- docs/guides/intelmq/index.md | 6 +- docs/guides/intelmq/running.md | 3 +- docs/guides/new_source/collectors/command.md | 4 +- docs/guides/new_source/collectors/index.md | 84 +- docs/guides/new_source/index.md | 39 +- docs/guides/new_source/parsers/index.md | 118 +- docs/installation/img/pipeline.png | Bin 43145 -> 0 bytes docs/installation/index.md | 5 + docs/installation/pipeline.md | 7 +- docs/installation/supervisord.md | 67 +- etc/n6/admin_panel.conf | 4 +- etc/supervisord/get_parsers_conf.py | 27 +- etc/supervisord/get_parsers_conf_py2k.py | 51 + etc/supervisord/program_template.tmpl | 4 +- ...201908.conf => program_template_py2k.tmpl} | 4 +- etc/supervisord/programs/n6aggregator.conf | 4 +- 
etc/supervisord/programs/n6archiveraw.conf | 4 +- etc/supervisord/programs/n6comparator.conf | 4 +- etc/supervisord/programs/n6enrich.conf | 4 +- etc/supervisord/programs/n6filter.conf | 4 +- etc/supervisord/programs/n6recorder.conf | 4 +- .../n6aggregator.conf} | 4 +- .../programs_py2k/n6archiveraw.conf | 15 + .../programs_py2k/n6comparator.conf | 15 + etc/supervisord/programs_py2k/n6enrich.conf | 15 + etc/supervisord/programs_py2k/n6filter.conf | 15 + .../n6recorder.conf} | 6 +- etc/supervisord/supervisord_py2k.conf | 41 + mkdocs.yml | 17 +- test_do_setup.py | 8 +- 95 files changed, 7325 insertions(+), 830 deletions(-) delete mode 100644 N6Core/n6/utils/recorder_conf_generator.py create mode 100644 N6DataPipeline/n6datapipeline/aggregator.py create mode 100644 N6DataPipeline/n6datapipeline/data/conf/00_global.conf create mode 100644 N6DataPipeline/n6datapipeline/data/conf/09_auth_db.conf create mode 100644 N6DataPipeline/n6datapipeline/data/conf/11_jinja_rendering.conf create mode 100644 N6DataPipeline/n6datapipeline/data/conf/11_mailing.conf create mode 100644 N6DataPipeline/n6datapipeline/data/templates/.gitkeep create mode 100644 N6DataPipeline/n6datapipeline/enrich.py create mode 100644 N6DataPipeline/n6datapipeline/tests/test_aggregator.py create mode 100644 N6DataPipeline/n6datapipeline/tests/test_enrich.py create mode 100644 N6DataSources/n6datasources/collectors/__init__.py create mode 100644 N6DataSources/n6datasources/collectors/abuse_ch.py create mode 100644 N6DataSources/n6datasources/collectors/base.py create mode 100644 N6DataSources/n6datasources/data/conf/01_global_parsers.conf create mode 100644 N6DataSources/n6datasources/data/conf/70_abuse_ch.conf create mode 100644 N6DataSources/n6datasources/parsers/__init__.py create mode 100644 N6DataSources/n6datasources/parsers/abuse_ch.py create mode 100644 N6DataSources/n6datasources/parsers/base.py create mode 100644 N6DataSources/n6datasources/tests/collectors/__init__.py create mode 100644 N6DataSources/n6datasources/tests/parsers/__init__.py create mode 100644 docs/data_flow.png create mode 100644 docs/data_flow_overview.md delete mode 100644 docs/installation/img/pipeline.png create mode 100644 etc/supervisord/get_parsers_conf_py2k.py rename etc/supervisord/{programs/n6parser_abusechfeodotracker201908.conf => program_template_py2k.tmpl} (83%) rename etc/supervisord/{programs/n6parser_abusechsslblacklist201902.conf => programs_py2k/n6aggregator.conf} (83%) create mode 100644 etc/supervisord/programs_py2k/n6archiveraw.conf create mode 100644 etc/supervisord/programs_py2k/n6comparator.conf create mode 100644 etc/supervisord/programs_py2k/n6enrich.conf create mode 100644 etc/supervisord/programs_py2k/n6filter.conf rename etc/supervisord/{programs/n6parser_spam404.conf => programs_py2k/n6recorder.conf} (88%) create mode 100644 etc/supervisord/supervisord_py2k.conf diff --git a/.gitignore b/.gitignore index 6abaffc..30a261d 100644 --- a/.gitignore +++ b/.gitignore @@ -72,6 +72,7 @@ ENV/ n6-env/ env.bak/ venv.bak/ +env_py3k/ # logs, runtime data *.log diff --git a/.n6-version b/.n6-version index a1513ac..2aa4d8f 100644 --- a/.n6-version +++ b/.n6-version @@ -1 +1 @@ -3.0.0b1 \ No newline at end of file +3.0.0b2 diff --git a/CHANGELOG.md b/CHANGELOG.md index 88dc02a..511b7e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,13 @@ # Changelog -Starting with 3.x.x release series, all notable changes applied to the -code of _n6_ are continuously documented in this file. 
+Starting with the 3.0.0 release, all notable changes applied to +the code of _n6_ will be continuously documented in this file. The format of this file is based, to much extent, on [Keep a Changelog](https://keepachangelog.com/). -## 3.0.0b1 - 2021-10-13 +## 3.0.0b... (beta releases...) - since 2021-10-13... + +TBD in the description of the 3.0.0 final release (soon...). -TBD diff --git a/N6AdminPanel/n6adminpanel/admin_panel.conf b/N6AdminPanel/n6adminpanel/admin_panel.conf index 9027561..846940d 100644 --- a/N6AdminPanel/n6adminpanel/admin_panel.conf +++ b/N6AdminPanel/n6adminpanel/admin_panel.conf @@ -19,7 +19,7 @@ ## IMPORTANT: the following 3 config sections should be uncommented ## and adjusted *ONLY* if the n6 Admin Panel application does *NOT* ## have access to the `09_auth_db.conf` file (being, typically, a part -## of the N6Core/N6Pipeline configuration) which (typically) already +## of the N6Core/N6DataPipeline configuration) which (typically) already ## contains these sections! # #[auth_db] @@ -65,7 +65,7 @@ ## IMPORTANT: the following 3 config sections should be kept here ## uncommented *ONLY* if the n6 Admin Panel application does *NOT* have ## access to the `11_mailing.conf` and `11_jinja_rendering.conf` files -## which, if exist (as a part of the N6Core/N6Pipeline configuration), +## which, if exist (as a part of the N6Core/N6DataPipeline configuration), ## typically already contain these sections! diff --git a/N6AdminPanel/n6adminpanel/mail_notices_helpers.py b/N6AdminPanel/n6adminpanel/mail_notices_helpers.py index 43874a4..aacf9c6 100644 --- a/N6AdminPanel/n6adminpanel/mail_notices_helpers.py +++ b/N6AdminPanel/n6adminpanel/mail_notices_helpers.py @@ -1,18 +1,21 @@ # Copyright (c) 2021 NASK. All rights reserved. -from typing import ( - Iterable, - Union, -) +from collections.abc import Iterable +from typing import Union from flask import ( flash, g, ) +from n6lib.auth_db.api import AuthDatabaseAPILookupError from n6lib.common_helpers import ascii_str +class NoRecipients(Exception): + pass + + class MailNoticesMixin(object): def try_to_send_mail_notices(self, notice_key, **get_notice_data_kwargs): @@ -21,10 +24,16 @@ def try_to_send_mail_notices(self, notice_key, **get_notice_data_kwargs): 'for notice_key={!a}.'.format(ascii_str(notice_key))) flash(msg, 'warning') return - notice_data = self.get_notice_data(**get_notice_data_kwargs) + try: + notice_data = self.get_notice_data(**get_notice_data_kwargs) + notice_recipients = list(self.get_notice_recipients(notice_data)) + if not notice_recipients: + raise NoRecipients('no matching non-blocked user(s) could be found') + except NoRecipients as exc: + flash(f'No e-mail notices could be sent because {exc}!', 'error') + return notice_lang = self.get_notice_lang(notice_data) assert notice_lang is None or isinstance(notice_lang, str) and len(notice_lang) == 2 - notice_recipients = list(self.get_notice_recipients(notice_data)) gathered_ok_recipients = [] with g.n6_mail_notices_api.dispatcher(notice_key, suppress_and_log_smtp_exc=True) as dispatch: @@ -43,18 +52,20 @@ def try_to_send_mail_notices(self, notice_key, **get_notice_data_kwargs): # (The following hooks can be overridden in subclasses.) - def get_notice_data(self, user_login): - # type: (...) 
-> dict - with g.n6_auth_manage_api_adapter as api: - user_and_org_basic_info = api.get_user_and_org_basic_info(user_login) + def get_notice_data(self, user_login) -> dict: + try: + with g.n6_auth_manage_api_adapter as api: + if api.is_user_blocked(user_login): + raise NoRecipients('the user {user_login!a} is blocked') + user_and_org_basic_info = api.get_user_and_org_basic_info(user_login) + except AuthDatabaseAPILookupError: + raise NoRecipients('the user {user_login!a} does not exist') return dict( user_and_org_basic_info, user_login=user_login) - def get_notice_lang(self, notice_data): - # type: (dict) -> Union[str, None] - return notice_data['lang'] - - def get_notice_recipients(self, notice_data): - # type: (dict) -> Iterable[str] + def get_notice_recipients(self, notice_data: dict) -> Iterable[str]: return [notice_data['user_login']] + + def get_notice_lang(self, notice_data: dict) -> Union[str, None]: + return notice_data['lang'] diff --git a/N6AdminPanel/n6adminpanel/org_request_helpers.py b/N6AdminPanel/n6adminpanel/org_request_helpers.py index 05b7ad5..3e8ca9b 100644 --- a/N6AdminPanel/n6adminpanel/org_request_helpers.py +++ b/N6AdminPanel/n6adminpanel/org_request_helpers.py @@ -1,5 +1,6 @@ # Copyright (c) 2020-2021 NASK. All rights reserved. +from collections.abc import Iterable import html import re import string @@ -591,19 +592,18 @@ def _after_status_transition_to_other(self, self.try_to_send_mail_notices(notice_key='org_config_update_rejected', req_id=org_request.id) - def get_notice_data(self, req_id): - # type: (...) -> dict + def get_notice_data(self, req_id) -> dict: notice_data = g.n6_org_config_info notice_data['update_info']['update_request_id'] = req_id return notice_data - def get_notice_lang(self, notice_data): - # type: (dict) -> Union[str, None] - return notice_data['notification_language'] # TODO?: separate per-user setting?... - - def get_notice_recipients(self, notice_data): + def get_notice_recipients(self, notice_data: dict) -> Iterable[str]: with g.n6_auth_manage_api_adapter as api: - return api.get_org_user_logins(org_id=notice_data['org_id']) + return api.get_org_user_logins(org_id=notice_data['org_id'], + only_nonblocked=True) + + def get_notice_lang(self, notice_data: dict) -> Union[str, None]: + return notice_data['notification_language'] # TODO?: separate per-user setting?... # diff --git a/N6Core/n6/base/config.py b/N6Core/n6/base/config.py index cb408c5..61887c2 100644 --- a/N6Core/n6/base/config.py +++ b/N6Core/n6/base/config.py @@ -95,9 +95,10 @@ def check_existing_dir_content(install_to, alternative_to): try: config_template_dir = 'n6/data/conf/' - files = resource_listdir(Requirement.parse("n6"), config_template_dir) + files = resource_listdir(Requirement.parse("n6core-py2"), config_template_dir) for f in files: - filename = resource_filename(Requirement.parse("n6"), os.path.join(config_template_dir, f)) + filename = resource_filename(Requirement.parse("n6core-py2"), + os.path.join(config_template_dir, f)) try: if not os.path.isdir(install_to): os.makedirs(install_to) diff --git a/N6Core/n6/collectors/generic.py b/N6Core/n6/collectors/generic.py index bc72845..0ee4a01 100644 --- a/N6Core/n6/collectors/generic.py +++ b/N6Core/n6/collectors/generic.py @@ -50,6 +50,7 @@ # # Exceptions +# LEGACY STUFF -- we DO NOT want to migrate it to n6datasources... class n6CollectorException(Exception): pass @@ -73,6 +74,7 @@ def set_configuration(self): self.config = ConfigSection('') +# LEGACY STUFF -- we DO NOT want to migrate it to n6datasources... 
class CollectorStateMixIn(object): """DO NOT USE THIS CLASS IN NEW CODE, USE ONLY CollectorWithStateMixin!""" @@ -112,6 +114,7 @@ def get_cache_file_name(self): return self.config['source'] + ".txt" +# LEGACY STUFF -- we DO NOT want to migrate it to n6datasources... class CollectorStateMixInPlus(CollectorStateMixIn): """ @@ -650,6 +653,7 @@ def start_publishing(self): self.inner_stop() +# TODO: migrate it to `n6datasources.collectors.base` when needed... class BaseEmailSourceCollector(BaseOneShotCollector): """ @@ -690,6 +694,8 @@ def get_output_prop_kwargs(self, email_msg, **kwargs): return prop_kwargs +# LEGACY STUFF -- we DO NOT want to migrate it to n6datasources... +# (use `n6datasources.collectors.base.BaseDownloadingCollector` instead) class BaseUrlDownloaderCollector(BaseCollector): config_group = None @@ -868,6 +874,9 @@ def _try_to_set_http_last_modified(self, headers): break +# LEGACY STUFF -- we DO NOT want to migrate it to n6datasources... +# **unless** (TODO) modernized to use `BaseDownloadingCollector` +# (instead of `BaseUrlDownloaderCollector`). class BaseRSSCollector(BaseOneShotCollector, BaseUrlDownloaderCollector): type = 'stream' @@ -1477,6 +1486,8 @@ def obtain_orig_data(self): # # Script/entry point factories +# LEGACY STUFF -- we DO NOT want to migrate it to n6datasources... +# (replaced by `n6datasources.collectors.base.AbstractBaseCollector.run_script()`) def generate_collector_main(collector_class): def collector_main(): with logging_configured(): @@ -1485,7 +1496,8 @@ def collector_main(): collector.run_handling() return collector_main - +# LEGACY STUFF -- we DO NOT want to migrate it to n6datasources... +# (use `n6datasources.collectors.base.add_collector_entry_point_functions()` instead) def entry_point_factory(module): for collector_class in all_subclasses(AbstractBaseCollector): if (not collector_class.__module__.endswith('.generic') and diff --git a/N6Core/n6/data/conf/70_abuse_ch.conf b/N6Core/n6/data/conf/70_abuse_ch.conf index 8ffa234..fb5a66f 100644 --- a/N6Core/n6/data/conf/70_abuse_ch.conf +++ b/N6Core/n6/data/conf/70_abuse_ch.conf @@ -131,6 +131,9 @@ prefetch_count = 20 [AbuseChFeodoTracker201908Parser] prefetch_count = 1 +[AbuseChFeodoTracker202110Parser] +prefetch_count = 1 + [AbuseChPalevoDoms201406Parser] prefetch_count = 1 diff --git a/N6Core/n6/parsers/generic.py b/N6Core/n6/parsers/generic.py index d0374b5..53047fb 100644 --- a/N6Core/n6/parsers/generic.py +++ b/N6Core/n6/parsers/generic.py @@ -849,6 +849,9 @@ def get_bl_current_time_from_data(self, data, parsed): +# LEGACY STUFF -- we DO NOT want to migrate it to `n6datasources.parsers.base`. +# IF it is really needed in Py3, please (TODO?) migrate it to +# `n6datasources.parsers.base_legacy`. class TabDataParser(BaseParser): """ @@ -1000,6 +1003,9 @@ def process_row_fields(self, data, parsed, *fields): +# LEGACY STUFF -- we DO NOT want to migrate it to `n6datasources.parsers.base`. +# IF it is really needed in Py3, please (TODO?) migrate it to +# `n6datasources.parsers.base_legacy`. class BlackListTabDataParser(TabDataParser, BlackListParser): """ @@ -1012,6 +1018,10 @@ class BlackListTabDataParser(TabDataParser, BlackListParser): # (+ adding process_row_fields() method placeholder) # -- then update the wiki page about parsers... # XXX: is it tested? +# +# LEGACY STUFF -- we DO NOT want to migrate it to `n6datasources.parsers.base`. +# IF it is really needed in Py3, please (TODO?) migrate it to +# `n6datasources.parsers.base_legacy`. 
class XmlDataParser(BaseParser): """ @@ -1057,6 +1067,11 @@ def iter_entry(self, data): +# +# Script/entry point factories + +# LEGACY STUFF -- we DO NOT want to migrate it to n6datasources... +# (replaced by `n6datasources.parsers.base.BaseParser.run_script()`) def generate_parser_main(parser_class): def parser_main(): with logging_configured(): @@ -1066,6 +1081,8 @@ def parser_main(): return parser_main +# LEGACY STUFF -- we DO NOT want to migrate it to n6datasources... +# (use `n6datasources.parsers.base.add_parser_entry_point_functions()` instead) def entry_point_factory(module): for parser_class in all_subclasses(BaseParser): if (not parser_class.__module__.endswith('.generic') and diff --git a/N6Core/n6/tests/utils/test_aggregator.py b/N6Core/n6/tests/utils/test_aggregator.py index 904a106..dec3105 100644 --- a/N6Core/n6/tests/utils/test_aggregator.py +++ b/N6Core/n6/tests/utils/test_aggregator.py @@ -913,7 +913,7 @@ def test_publish_event(self, count, expected_body_content): self.assertEqual(len(self._aggregator.publish_output.mock_calls), 1) publish_output_kwargs = self._aggregator.publish_output.mock_calls[0][-1] - self.assertEqual(set(publish_output_kwargs.iterkeys()), {"routing_key", "body"}) + self.assertEqual(set(publish_output_kwargs.keys()), {"routing_key", "body"}) self.assertEqual(publish_output_kwargs["routing_key"], expected_routing_key) self.assertJsonEqual(publish_output_kwargs["body"], expected_body_content) @@ -1544,19 +1544,26 @@ def test_store_restore_state(self): # the state, but there is no access to the given path; first, # make sure there actually is no access to the given path tmp_db_path = "/root/example.pickle" - if not os.access(tmp_db_path, os.W_OK): - with patch.object(self._adw, "dbpath", tmp_db_path): - self.assertRaises(IOError, self._adw.store_state()) + assert not os.access(tmp_db_path, os.W_OK), ('The test case relies on the assumption that ' + 'the user running the tests does not ' + 'have permission to write ' + 'to: {!r}'.format(tmp_db_path)) + self._adw.dbpath = tmp_db_path + with patch('n6.utils.aggregator.LOGGER') as patched_logger: + self._adw.store_state() + patched_logger.error.assert_called_once() # assert the exception is being raised when trying to restore # the state from nonexistent file; first, safely create # a temporary file, then close and remove it, so the path # most likely does not exist with tempfile.NamedTemporaryFile() as fp: tmp_db_path = fp.name - if not os.path.exists(tmp_db_path): - with patch.object(self._adw, "dbpath", tmp_db_path), \ - self.assertRaisesRegexp(IOError, r"No such file or directory"): - self._adw.restore_state() + assert not os.path.exists(tmp_db_path), ('The randomly generated temporary directory: ' + '{!r} still exists, so the test cannot ' + 'be correctly performed'.format(tmp_db_path)) + with patch.object(self._adw, "dbpath", tmp_db_path), \ + self.assertRaisesRegexp(IOError, r"No such file or directory"): + self._adw.restore_state() @foreach(_test_process_new_message_data) def test_process_new_message(self, messages, expected_source_time, @@ -1695,7 +1702,7 @@ def test_generate_suppressed_events_after_timeout(self, datetime.timedelta(*args, **kw)) # actual call generated_events = list(self._adw.generate_suppresed_events_after_timeout()) - expected_events = [event for source, vals in source_to_expected_events.iteritems() + expected_events = [event for source, vals in source_to_expected_events.items() if source in expected_inactive_sources for event in vals] self.assertEqual(expected_events, 
generated_events) diff --git a/N6Core/n6/tests/utils/test_enrich.py b/N6Core/n6/tests/utils/test_enrich.py index 07e4295..71621dd 100644 --- a/N6Core/n6/tests/utils/test_enrich.py +++ b/N6Core/n6/tests/utils/test_enrich.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- -# Copyright (c) 2013-2020 NASK. All rights reserved. +# Copyright (c) 2013-2021 NASK. All rights reserved. import datetime import hashlib import os import unittest -import iptools +import iptools #3 --remove-replace iptools import mock from geoip2.errors import GeoIP2Error from dns.exception import DNSException @@ -640,7 +640,7 @@ def test_routing_key_modified(self): def test__get_excluded_ips__with_excluded_ips_in_config(self): self._prepare_config_for_excluded_ips(['1.1.1.1', '2.2.2.2', '3.3.3.3']) - expected = iptools.IpRangeList('1.1.1.1', '2.2.2.2', '3.3.3.3') + expected = iptools.IpRangeList('1.1.1.1', '2.2.2.2', '3.3.3.3') #3 --replace iptools result = self.enricher._get_excluded_ips() self.assertItemsEqual(expected, result) @@ -701,7 +701,7 @@ def test__filter_out_excluded_ips__with_excluded_ips_being_None(self): self.assertEqual(ip_to_enr_mock.mock_calls, ip_to_enr_expected_calls) def test__filter_out_excluded_ips__with_no_ip_in_excluded_ips(self): - self.enricher.excluded_ips = iptools.IpRangeList('1.1.1.1', '2.2.2.2', '3.3.3.3') + self.enricher.excluded_ips = iptools.IpRangeList('1.1.1.1', '2.2.2.2', '3.3.3.3') #3 --replace iptools data = RecordDict({ "url": "http://www.nask.pl/asd", "address": [{'ip': '1.1.1.5'}, {'ip': '2.1.1.1'}], @@ -717,7 +717,7 @@ def test__filter_out_excluded_ips__with_no_ip_in_excluded_ips(self): self.assertEqual(ip_to_enr_mock.mock_calls, ip_to_enr_expected_calls) def test__filter_out_excluded_ips__with_ip_in_excluded_ips__1(self): - self.enricher.excluded_ips = iptools.IpRangeList('1.1.1.1', '2.2.2.2', '3.3.3.3') + self.enricher.excluded_ips = iptools.IpRangeList('1.1.1.1', '2.2.2.2', '3.3.3.3') #3 --replace iptools data = RecordDict({ "url": "http://www.nask.pl/asd", "address": [{'ip': '1.1.1.1'}, {'ip': '1.1.1.6'}], @@ -735,7 +735,7 @@ def test__filter_out_excluded_ips__with_ip_in_excluded_ips__1(self): self.assertEqual(ip_to_enr_mock.mock_calls, ip_to_enr_expected_calls) def test__filter_out_excluded_ips__with_ip_in_excluded_ips__2(self): - self.enricher.excluded_ips = iptools.IpRangeList('1.1.1.1', '2.2.2.2', '3.3.3.3') + self.enricher.excluded_ips = iptools.IpRangeList('1.1.1.1', '2.2.2.2', '3.3.3.3') #3 --replace iptools data = RecordDict({ "url": "http://www.nask.pl/asd", "address": [{'ip': '1.1.1.1', 'asn': 1234}], @@ -753,7 +753,7 @@ def test__filter_out_excluded_ips__with_ip_in_excluded_ips__2(self): self.assertEqual(ip_to_enr_mock.mock_calls, ip_to_enr_expected_calls) def test__filter_out_excluded_ips__with_range_of_ips(self): - self.enricher.excluded_ips = iptools.IpRangeList('3.0.0.0/8') + self.enricher.excluded_ips = iptools.IpRangeList('3.0.0.0/8') #3 --replace iptools data = RecordDict({ "url": "http://www.nask.pl/asd", "address": [ diff --git a/N6Core/n6/utils/aggregator.py b/N6Core/n6/utils/aggregator.py index a276eae..f11670c 100644 --- a/N6Core/n6/utils/aggregator.py +++ b/N6Core/n6/utils/aggregator.py @@ -1,7 +1,7 @@ # Copyright (c) 2013-2021 NASK. All rights reserved. 
import collections -import cPickle +import pickle import datetime import json import os @@ -11,6 +11,7 @@ QueuedBase, n6QueueProcessingException, ) +from n6lib.common_helpers import open_file from n6lib.config import ConfigMixin from n6lib.datetime_helpers import parse_iso_datetime_to_utc from n6lib.log_helpers import ( @@ -131,7 +132,7 @@ def generate_suppressed_events(self): cutoff_time = self.time - datetime.timedelta(hours=AGGREGATE_WAIT) cutoff_check_complete = False for_cleanup = [] - for k, v in self.groups.iteritems(): + for k, v in self.groups.items(): if v.until >= cutoff_time: cutoff_check_complete = True if cutoff_check_complete and v.until.date() == self.time.date(): @@ -145,7 +146,7 @@ def generate_suppressed_events(self): # generate suppressed events from buffer cutoff_time = self.time - self.time_tolerance for_cleanup = [] - for k, v in self.buffer.iteritems(): + for k, v in self.buffer.items(): if v.until >= cutoff_time: break for_cleanup.append(k) @@ -155,10 +156,10 @@ def generate_suppressed_events(self): del self.buffer[k] def generate_suppressed_events_after_inactive(self): - for _, v in self.buffer.iteritems(): + for _, v in self.buffer.items(): # XXX: see ticket #6243 (check whether here is OK or also will need to be changed) yield 'suppressed', v.to_dict() if v.count > 1 else None - for _, v in self.groups.iteritems(): + for _, v in self.groups.items(): # XXX: see ticket #6243 (check whether here is OK or also will need to be changed) yield 'suppressed', v.to_dict() if v.count > 1 else None self.groups.clear() @@ -207,14 +208,14 @@ def __init__(self, dbpath, time_tolerance, time_tolerance_per_source): def store_state(self): try: - with open(self.dbpath, 'w') as f: - cPickle.dump(self.aggr_data, f) + with open_file(self.dbpath, 'wb') as f: + pickle.dump(self.aggr_data, f, protocol=2) except IOError: LOGGER.error('Error saving state to: %r', self.dbpath) def restore_state(self): - with open(self.dbpath, 'r') as f: - self.aggr_data = cPickle.load(f) + with open_file(self.dbpath, 'rb') as f: + self.aggr_data = pickle.load(f) def process_new_message(self, data): """ @@ -252,7 +253,7 @@ def generate_suppresed_events_after_timeout(self): """ LOGGER.debug('Detecting inactive sources after tick timout') time_now = datetime.datetime.utcnow() - for source in self.aggr_data.sources.itervalues(): + for source in self.aggr_data.sources.values(): LOGGER.debug('Checking source: %r', source) if source.last_event + datetime.timedelta(hours=SOURCE_INACTIVITY_TIMEOUT) < time_now: LOGGER.debug('Source inactive. Generating suppressed events') @@ -287,7 +288,7 @@ def __init__(self, **kwargs): self.aggregator_config = self.get_config_section() dbpath_dirname = os.path.dirname(self.aggregator_config['dbpath']) try: - os.makedirs(dbpath_dirname, 0700) + os.makedirs(dbpath_dirname, 0o700) except OSError: pass super(Aggregator, self).__init__(**kwargs) diff --git a/N6Core/n6/utils/enrich.py b/N6Core/n6/utils/enrich.py index 69af0df..e415198 100644 --- a/N6Core/n6/utils/enrich.py +++ b/N6Core/n6/utils/enrich.py @@ -2,14 +2,14 @@ import collections import os -import urlparse +import urllib.parse import dns.resolver # TODO: After migration to Pyton 3.x: remove the `iptools` dependency, # adjusting our code to use std lib's `ipaddress` (maybe also # adding IPv4/v6/both-dedicated config converters?), and/or our # own existing IP-address-related helpers... 
-import iptools +import iptools #3 --remove-replace iptools import maxminddb.const from dns.exception import DNSException from geoip2 import database, errors @@ -70,7 +70,7 @@ def __init__(self, **kwargs): def _get_excluded_ips(self): if self._enrich_config['excluded_ips']: - return iptools.IpRangeList(*self._enrich_config['excluded_ips']) + return iptools.IpRangeList(*self._enrich_config['excluded_ips']) #3 --replace iptools return None def _setup_dnsresolver(self, dnshost, dnsport): @@ -244,13 +244,13 @@ def _final_sanity_assertions(self, data): for name in enriched_keys) assert all( set(addr_keys).issubset(ip_to_addr[ip]) - for ip, addr_keys in ip_to_enriched_address_keys.iteritems()) + for ip, addr_keys in list(ip_to_enriched_address_keys.items())) # # Resolution helpers def url_to_fqdn_or_ip(self, url): - parsed_url = urlparse.urlparse(url) + parsed_url = urllib.parse.urlparse(url) if parsed_url.netloc.endswith(':'): # URL is probably wrong -- something like: "http://http://..." return '' diff --git a/N6Core/n6/utils/recorder_conf_generator.py b/N6Core/n6/utils/recorder_conf_generator.py deleted file mode 100644 index b7a05f4..0000000 --- a/N6Core/n6/utils/recorder_conf_generator.py +++ /dev/null @@ -1,384 +0,0 @@ -# Copyright (c) 2020 NASK. All rights reserved. - -import argparse -import sys - -import os.path as osp - -from n6lib.data_spec.fields import ( - FieldValueError, - SourceField, -) - -CONF_PATTERN = """ -[program:{prog}] -command={command} ; the program (relative uses PATH, can take args) -process_name=%(program_name)s ; process_name expr (default %(program_name)s) -numprocs=1 ; number of processes copies to start (def 1) - -autorestart=unexpected ; whether/when to restart (default: unexpected) -startsecs=1 ; number of secs prog must stay running (def. 1) -startretries=3 ; max # of serial start failures (default 3) -exitcodes=0 ; 'expected' exit codes for process (default 0) -stopsignal=INT ; signal used to kill process (default TERM) -stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10) -stopasgroup=false ; send stop signal to the UNIX process group (default false) -killasgroup=false ; SIGKILL the UNIX process group (def false) - -environment=HOME="/home/dataman" -""" - - -def _print(msg, file=None): - if file is None: - file = sys.stdout - file.write(msg+"\n") - - -def print_err(msg, *args, **kwargs): - file = kwargs.pop('file', sys.stderr) - formatted = "[{}] ERROR: {}".format( - sys.argv[0], msg.format(*args, **kwargs)) - _print(formatted, file) - - -def print_msg(msg, *args, **kwargs): - _print("[{}] {}".format( - sys.argv[0], msg.format(*args, **kwargs))) - - -class RecorderConfigGenerationError(Exception): - """ - General purpose exception to signal error - during recorder supervisors' config generation. - - If the `RecorderConfigGenerator` would be called - from other code raising this exception instead - of quitting on error allows caller to recover. - """ - def __init__(self, msg): - super(RecorderConfigGenerationError, self).__init__() - self.exit_msg = msg - - -class RecorderConfigGenerator(object): - """ - Generates supervisor configuration files - for the recorders run with flag `--n6recorder-blacklist` - or `--n6recorder-non-blacklist`. - - Sources to generate the configuration for will - be read from the path passed under the `source_file_path`. - File under this path should contain each source in a separate - line in format `source_label.source_channel`. - Sources file can contain blank lines and/or comments - starting with a '#' character. 
- Example: - - # important sources - source_l1.channel1 - important.important_channel - - # rest of the sources - some_s.rest_channel - - Most of the implementation is transactional. - What it means is that if an error occurs then no change - is made to the outside environment. Exception to that - is writing configuration to files. - If some error occurs during this phase then the already - written files will not be reverted to the state before. - - To generate configuration files call mwthod `gen_and_write_source_conf()`. - To generate non-blacklist configuration file call method `gen_and_write_non_bl_conf()`. - """ - - N6RECORDER_BL_CONF_NAME_PATT = "n6recorder_bl_{}" - N6RECORDER_NON_BL_CONF_BAME = "n6recorder_non_blacklist" - - def __init__(self, source_file_path, dest_path, overwrite=False, skip_errors=True): - """ - Initializes `RecorderConfigGenerator` instance. - - Args/kwargs: - `source_file_path`: - *Path to the file* containing sources to generate - the configurations for. - `dest_path`: - *Path to the directory* where the - configuration files will be generated to. - `overwrite`: - Should the content of the configuration files - be overwritten if the files already exists. - If this flag is not set and any of the files exist - then exception will be risen without any changes - being made to the outside environment. - Defaults to `False`. - `skip_errors`: - By default `RecorderConfigGenerator` halts on every - error. This flag changes this behavior so that - not all errors result in exception being thrown. - Instead the execution proceeds as if the error never - occured and a cause of the error is ignored. - If this flag is set then the following - errors will be skipped: - - wrong source format in the sources file, - - configuration file already exists and overwriting flag is not set. - - Raises: - `RecorderConfigGenerationError`: - If `source_file_path` is not an existing file or - `dest_path` is not an existing directory. - """ - super(RecorderConfigGenerator, self).__init__() - self.source_file_path = source_file_path - self.dest_path = dest_path - self.overwrite = overwrite - self.skip_errors = skip_errors - self._check_source_path() - self._check_dest_path() - - def _check_source_path(self): - if not osp.isfile(self.source_file_path): - raise RecorderConfigGenerationError( - "source file '{}' does not exist or is not a file".format( - self.source_file_path)) - - def _check_dest_path(self): - if not osp.isdir(self.dest_path): - raise RecorderConfigGenerationError( - "destination path '{}' does not exist or is not a directory".format( - self.dest_path)) - - # static helper functions - - @staticmethod - def generate_bl_recorder_conf(source): - """ - Creates a configuration for the blacklist recorder - for the passed source. - - Returns: - Created configuration as `str`. - """ - prog_fmt = RecorderConfigGenerator.N6RECORDER_BL_CONF_NAME_PATT - return CONF_PATTERN.format( - prog=prog_fmt.format(source.replace(".", "_")), - command="n6recorder --n6recorder-blacklist {}".format(source)) - - @staticmethod - def generate_non_bl_recorder_conf(): - """ - Creates a configuration for the non-blacklist - recorder. - - Returns: - Created configuration as `str`. - """ - return CONF_PATTERN.format( - prog=RecorderConfigGenerator.N6RECORDER_NON_BL_CONF_BAME, - command="n6recorder --n6recorder-non-blacklist") - - @staticmethod - def file_name_from_source(source): - """ - Creates a filename for the configuration file - from the given source. - - Returns: - Created filename as `str`. 
- """ - name_patt = RecorderConfigGenerator.N6RECORDER_BL_CONF_NAME_PATT - name_patt += ".conf" - return name_patt.format(source.replace(".", "_")) - - # logic implementation - - def get_source_configurations(self): - """ - Create a dictionary mapping source from the - sources file to the configuration generated for - this source. - - Returns: - Created dictionary. - - Raises: - `RecorderConfigGenerationError`: - If the `skip_errors` flag is set to `False` - and some source in the file has a wrong format. - """ - configs = {} - errors = [] - with open(self.source_file_path) as src: - for line, source in enumerate(src.readlines(), start=1): - source = source.rstrip() - if not source or self._is_comment(source): - continue - try: - source = SourceField().clean_result_value(source) - except FieldValueError as e: - err_msg = "({}:{}) {}".format(self.source_file_path, line, e) - if self.skip_errors: - print_msg("skipping error: {}".format(err_msg)) - continue - errors.append(err_msg) - continue - configs[source] = self.generate_bl_recorder_conf(source) - if errors: - for err in errors: - print_err(err) - raise RecorderConfigGenerationError( - "there were errors during config generation") - return configs - - def _is_comment(self, source): - return source.startswith('#') - - def _check_config_files(self, configs): - """ - This method checks if the path in the `configs` - dictionary exists. If so 3 things can be done: - - if `skip_errors` and `overwrite` flags are `False` - then an exception will be raised. - - if `skip_errors` is `True` and `overwrite` is `False` - then the path will be removed from the dictionary. - So that later no writes to it happen. - - if `overwrite` is `True` then nothing is done - and later the content of the file will be overwritten - with newly generated configuration. - - Args: - `configs`: - A dictionary mapping paths to the sources' - configuration files with configurations generated - for the sources. - - Raises: - `RecorderConfigGenerationError`: - If the `skip_errors` and `overwrite` flags are set to `False` - and one of the paths already exists. - """ - confs_to_del = [] - errors = [] - for conf_name in configs: - conf_path = osp.join(self.dest_path, conf_name) - if osp.exists(conf_path): - if not self.overwrite: - err_msg = ( - "config file '{}' already exists " - "and overwritting was not allowed " - "(use --overwrite if you want it " - "to be overwritten)").format(conf_path) - if self.skip_errors: - print_msg("skipping error: {}".format(err_msg)) - confs_to_del.append(conf_name) - continue - errors.append(err_msg) - else: - print_msg("config file '{}' will be overwritten", conf_path) - if errors: - for err in errors: - print_err(err) - raise RecorderConfigGenerationError( - "there were errors during files checking") - for conf in confs_to_del: - del configs[conf] - - def _write_configurations(self, configs): - """ - Writes configurations to their designated files - overwriting whatever content there was before. - - Args: - `configs`: - Dictionary mapping file paths to their - new content. - """ - for conf, content in configs.items(): - wrt_path = osp.join(self.dest_path, conf) - with open(wrt_path, 'w') as f: - f.write(content) - - def gen_and_write_non_bl_conf(self): - """ - Works like `gen_and_write_source_conf()` method but instead of - creating configuration for sources listed in the file - creates a single configuration file for the - non-blacklist recorder. 
- - Raises: - `RecorderConfigGenerationError`: - If the configuration file alredy exists and - the flags `skip_errors` and `overwrite` are - set to `False`. - """ - path = osp.join( - self.dest_path, - self.N6RECORDER_NON_BL_CONF_BAME + ".conf") - if osp.exists(path): - if not self.overwrite: - err_msg = ( - "config file '{}' already exists " - "and overwritting wasn't allowed " - "(use --overwrite if you want it " - " to be overwritten)").format(path) - if self.skip_errors: - print_msg("skipping error: {}".format(err_msg)) - return - raise RecorderConfigGenerationError(err_msg) - print_msg("config file '{}' will be overwritten", path) - with open(path, 'w') as f: - f.write(self.generate_non_bl_recorder_conf()) - - def gen_and_write_source_conf(self): - """ - Creates and writes the configuration files for the - sources in the source file to the destination path. - - Raises: - `RecorderConfigGenerationError`: - If there were errors in the called methods. - See other methods documentation for more details. - """ - configs = self.get_source_configurations() - configs = {self.file_name_from_source(k): v for k, v in configs.items()} - self._check_config_files(configs) - self._write_configurations(configs) - - -def get_argparser(): - parser = argparse.ArgumentParser( - description="Generate supervisor configuration in the given destination" - " directory for the sources given in the source file.") - parser.add_argument("source", - help="Path to the source file containing one source per line." - " Source is a string in format 'source_label.source_channel'") - parser.add_argument("dest", - help="Path to a directory to generate the config files to.") - parser.add_argument("-n", "--non-blacklist", action='store_true', - help="Additionaly to the bl recorders creates " - "configuration for the non blacklist recorder.") - parser.add_argument("-o", "--overwrite", action='store_true', - help="Should the configuration files be overwritten if already present") - parser.add_argument("-s", "--skip-errors", action='store_true', - help="If set then if possible script will try to skip on errors instead" - " of stopping execution" - "(for example: illegal value in the source file") - return parser - - -def main(): - args = get_argparser().parse_args() - try: - conf_generator = RecorderConfigGenerator( - args.source, args.dest, args.overwrite, args.skip_errors) - conf_generator.gen_and_write_source_conf() - if args.non_blacklist: - conf_generator.gen_and_write_non_bl_conf() - except RecorderConfigGenerationError as e: - print_err(e.exit_msg) - sys.exit(1) - -if __name__ == "__main__": - main() diff --git a/N6Core/setup.py b/N6Core/setup.py index 7fd8f65..8467a39 100644 --- a/N6Core/setup.py +++ b/N6Core/setup.py @@ -113,7 +113,11 @@ def find_collectors(): n6_version = get_n6_version('.n6-version') -requirements = ['n6sdk==' + n6_version, 'n6lib==' + n6_version, 'n6corelib==' + n6_version] +requirements = [ + 'n6sdk-py2==' + n6_version, + 'n6lib-py2==' + n6_version, + 'n6corelib-py2==' + n6_version, +] console_scripts_list = ['n6config = n6.base.config:install_default_config'] if not collectors_only: @@ -123,7 +127,7 @@ def find_collectors(): setup( - name="n6", + name="n6core-py2", version=n6_version, packages=find_packages(), diff --git a/N6CoreLib/n6corelib/pki_related_test_helpers.py b/N6CoreLib/n6corelib/pki_related_test_helpers.py index 6e0aea4..8e1b7f4 100644 --- a/N6CoreLib/n6corelib/pki_related_test_helpers.py +++ b/N6CoreLib/n6corelib/pki_related_test_helpers.py @@ -329,7 +329,7 @@ class 
_load_cert_metadata_by_label(_LoadByCertLabelMixin, def _get_pki_test_data_path(): try: return resource_filename( - Requirement.parse("n6lib"), + Requirement.parse("n6lib-py2"), 'n6lib/tests/certs_and_requests_for_testing/') finally: cleanup_resources() diff --git a/N6CoreLib/setup.py b/N6CoreLib/setup.py index 6a6353b..43a3523 100644 --- a/N6CoreLib/setup.py +++ b/N6CoreLib/setup.py @@ -57,7 +57,10 @@ def setup_data_line_generator(filename_base): pip_install = False setup_install = False -requirements = ['n6sdk==' + n6_version, 'n6lib==' + n6_version] +requirements = [ + 'n6sdk-py2==' + n6_version, + 'n6lib-py2==' + n6_version, +] requirements_pip = [] dep_links = [] for line in setup_data_line_generator('requirements'): @@ -81,7 +84,7 @@ def setup_data_line_generator(filename_base): setup( - name="n6corelib", + name="n6corelib-py2", version=n6_version, packages=find_packages(), diff --git a/N6DataPipeline/console_scripts b/N6DataPipeline/console_scripts index 17dd936..fb2e4ba 100644 --- a/N6DataPipeline/console_scripts +++ b/N6DataPipeline/console_scripts @@ -1,6 +1,6 @@ #n6archiveraw = n6datapipeline.archive_raw:main -#n6aggregator = n6datapipeline.aggregator:main -#n6enrich = n6datapipeline.enrich:main +n6aggregator = n6datapipeline.aggregator:main +n6enrich = n6datapipeline.enrich:main #n6comparator = n6datapipeline.comparator:main #n6filter = n6datapipeline.filter:main #n6recorder = n6datapipeline.recorder:main diff --git a/N6DataPipeline/n6datapipeline/aggregator.py b/N6DataPipeline/n6datapipeline/aggregator.py new file mode 100644 index 0000000..8798eca --- /dev/null +++ b/N6DataPipeline/n6datapipeline/aggregator.py @@ -0,0 +1,400 @@ +# Copyright (c) 2013-2021 NASK. All rights reserved. + +import collections +import pickle +import datetime +import json +import os +import os.path + +from n6datapipeline.base import ( + LegacyQueuedBase, + n6QueueProcessingException, +) +from n6lib.common_helpers import open_file +from n6lib.config import ConfigMixin +from n6lib.datetime_helpers import parse_iso_datetime_to_utc +from n6lib.log_helpers import ( + get_logger, + logging_configured, +) +from n6lib.record_dict import RecordDict + + +LOGGER = get_logger(__name__) + + +# in hours, time to wait for the next event before suppressed event is generated +AGGREGATE_WAIT = 12 + +# in hours, when the source is considered inactive (cleanup should be triggered) +SOURCE_INACTIVITY_TIMEOUT = 24 + +# in seconds, tick between checks of inactive sources +TICK_TIMEOUT = 3600 + + +class HiFreqEventData: + + def __init__(self, payload): + self.group = payload.get('_group') + self.until = parse_iso_datetime_to_utc(payload.get('time')) + self.first = parse_iso_datetime_to_utc(payload.get('time')) + self.count = 1 # XXX: see ticket #6243 + self.payload = payload + + def to_dict(self): + result = self.payload + result['count'] = self.count + result['until'] = str(self.until) + result['_first_time'] = str(self.first) + return result + + def update_payload(self, update_dict): + tmp = self.payload.copy() + tmp.update(update_dict) + self.payload = tmp + + +class SourceData: + + def __init__(self, time_tolerance): + self.time = None # current time tracked for source (based on event time) + # utc time of the last event (used to trigger cleanup if source is inactive) + self.last_event = None + self.groups = collections.OrderedDict() # groups aggregated for a given source + self.time_tolerance = datetime.timedelta(seconds=time_tolerance) + # buffer to store aggregated events until time_tolerance has passed + self.buffer 
= collections.OrderedDict() + + def update_time(self, event_time): + if event_time > self.time: + self.time = event_time + self.last_event = datetime.datetime.utcnow() + + def process_event(self, data): + event_time = parse_iso_datetime_to_utc(data['time']) + event = self.groups.get(data['_group']) + if self.time is None: + self.time = event_time + if event_time + self.time_tolerance < self.time: + if event is None or event.first > event_time: + LOGGER.error('Event out of order. Ignoring. Data: %s', data) + raise n6QueueProcessingException('Event out of order.') + else: + LOGGER.info('Event out of order, but not older than group\'s first event, ' + 'so it will be added to existing aggregate group. Data: %s', data) + event.until = max(event.until, event_time) + event.count += 1 # XXX: see ticket #6243 + return False + + if event is None: + if event_time < self.time: + # unordered event, self.buffer may contain suppressed event + LOGGER.debug("Unordered event of the '%s' group, '%s' source within time " + "tolerance. Check and update buffer.", data['_group'], data['source']) + buffered_event = self.buffer.get(data['_group']) + if buffered_event is not None: + buffered_event.count += 1 # XXX: see ticket #6243 + self.buffer[data['_group']] = buffered_event + return False + # Event not seen before - add new event to group + LOGGER.debug("A new group '%s' for '%s' source began to be aggregated, " + "first event is being generated.", data['_group'], data['source']) + self.groups[data['_group']] = HiFreqEventData(data) # XXX: see ticket #6243 + self.update_time(parse_iso_datetime_to_utc(data['time'])) + return True + + if (event_time > event.until + datetime.timedelta(hours=AGGREGATE_WAIT) or + event_time.date() > self.time.date()): + LOGGER.debug("A suppressed event is generated for the '%s' group of " + "'%s' source due to passing of %s hours between events.", + data['_group'], data['source'], AGGREGATE_WAIT) + # 24 hour aggregation or AGGREGATE_WAIT time passed between events in group + del self.groups[data['_group']] + self.groups[data['_group']] = HiFreqEventData(data) # XXX: see ticket #6243 + self.buffer[data['_group']] = event + self.update_time(parse_iso_datetime_to_utc(data['time'])) + return True + + # Event for existing group and still aggregating + LOGGER.debug("Event is being aggregated in the '%s' group of the '%s' source.", + data['_group'], data['source']) + event.count += 1 # XXX: see ticket #6243 + if event_time > event.until: + event.until = event_time + del self.groups[data['_group']] + self.groups[data['_group']] = event + self.update_time(parse_iso_datetime_to_utc(data['time'])) + return False + + def generate_suppressed_events(self): + cutoff_time = self.time - datetime.timedelta(hours=AGGREGATE_WAIT) + cutoff_check_complete = False + for_cleanup = [] + for k, v in self.groups.items(): + if v.until >= cutoff_time: + cutoff_check_complete = True + if cutoff_check_complete and v.until.date() == self.time.date(): + break + for_cleanup.append(k) + self.buffer[k] = v + # yield 'suppressed', v.to_dict() if v.count > 1 else None + for k in for_cleanup: + del self.groups[k] + + # generate suppressed events from buffer + cutoff_time = self.time - self.time_tolerance + for_cleanup = [] + for k, v in self.buffer.items(): + if v.until >= cutoff_time: + break + for_cleanup.append(k) + # XXX: see ticket #6243 (check whether here is OK or also will need to be changed) + yield 'suppressed', v.to_dict() if v.count > 1 else None + for k in for_cleanup: + del self.buffer[k] + + def 
generate_suppressed_events_after_inactive(self): + for _, v in self.buffer.items(): + # XXX: see ticket #6243 (check whether here is OK or also will need to be changed) + yield 'suppressed', v.to_dict() if v.count > 1 else None + for _, v in self.groups.items(): + # XXX: see ticket #6243 (check whether here is OK or also will need to be changed) + yield 'suppressed', v.to_dict() if v.count > 1 else None + self.groups.clear() + self.buffer.clear() + self.last_event = datetime.datetime.utcnow() + + def __repr__(self): + return repr(self.groups) + + +class AggregatorData: + + def __init__(self): + self.sources = {} + + def get_or_create_sourcedata(self, event, time_tolerance): + source = event['source'] + sd = self.sources.get(source) + if sd is None: + sd = SourceData(time_tolerance) + self.sources[source] = sd + return sd + + def get_sourcedata(self, event): + # event['source'] exists because it was created in + # `Aggregator.process_event()` where `process_new_message(data)` + # is run before `generate_suppresed_events_for_source(data)`. + return self.sources[event['source']] + + def __repr__(self): + return repr(self.sources) + + +class AggregatorDataWrapper: + + def __init__(self, dbpath, time_tolerance, time_tolerance_per_source): + self.aggr_data = None + self.dbpath = dbpath + self.time_tolerance = time_tolerance + self.time_tolerance_per_source = time_tolerance_per_source + try: + self.restore_state() + except: + LOGGER.error('Error restoring state from: %a', self.dbpath) + self.aggr_data = AggregatorData() + + def store_state(self): + try: + with open_file(self.dbpath, 'wb') as f: + pickle.dump(self.aggr_data, f, protocol=2) + except IOError: + LOGGER.error('Error saving state to: %a', self.dbpath) + + def restore_state(self): + with open_file(self.dbpath, 'rb') as f: + self.aggr_data = pickle.load(f) + + def process_new_message(self, data): + """ + Processes a message and validates agains db to detect suppressed + event. + Adds new entry to db if necessary (new) or updates entry. + + Returns: + True: when first event in the group received + (i.e. should not be suppressed) + False: when next event in group received + (i.e. should be suppressed and count updated) + """ + + source_data = self.aggr_data.get_or_create_sourcedata( + data, + self.time_tolerance_per_source.get(data['source']) or self.time_tolerance, + ) + result = source_data.process_event(data) + return result + + def generate_suppresed_events_for_source(self, data): + """ + Called after each event in a given source was processed. + Yields suppressed events. + """ + source_data = self.aggr_data.get_sourcedata(data) + for event in source_data.generate_suppressed_events(): + yield event + + def generate_suppresed_events_after_timeout(self): + """Scans all stored sources and based on real time + (i.e. source has been inactive for defined time) + generates suppressed events for inactive sources + """ + LOGGER.debug('Detecting inactive sources after tick timout') + time_now = datetime.datetime.utcnow() + for source in self.aggr_data.sources.values(): + LOGGER.debug('Checking source: %a', source) + if source.last_event + datetime.timedelta(hours=SOURCE_INACTIVITY_TIMEOUT) < time_now: + LOGGER.debug('Source inactive. 
Generating suppressed events') + for type_, event in source.generate_suppressed_events_after_inactive(): + LOGGER.debug('%a: %a', type_, event) + yield type_, event + + +class Aggregator(ConfigMixin, LegacyQueuedBase): + + input_queue = { + 'exchange': 'event', + 'exchange_type': 'topic', + 'queue_name': 'aggregator', + 'accepted_event_types': [ + 'hifreq', + ], + } + output_queue = { + 'exchange': 'event', + 'exchange_type': 'topic', + } + + config_spec = ''' + [aggregator] + dbpath + time_tolerance :: int + time_tolerance_per_source = {} :: json + ''' + + def __init__(self, **kwargs): + self.aggregator_config = self.get_config_section() + dbpath_dirname = os.path.dirname(self.aggregator_config['dbpath']) + try: + os.makedirs(dbpath_dirname, 0o700) + except OSError: + pass + super(Aggregator, self).__init__(**kwargs) + # store dir doesn't exist, stop aggregator + if not os.path.isdir(dbpath_dirname): + raise Exception('store dir does not exist, stop aggregator, path:', + self.aggregator_config['dbpath']) + # store directory exists, but it has no rights to write + if not os.access(dbpath_dirname, os.W_OK): + raise Exception('stop aggregator, remember to set the rights' + ' for user, which runs aggregator, path:', + self.aggregator_config['dbpath']) + self.db = AggregatorDataWrapper(self.aggregator_config['dbpath'], + self.aggregator_config['time_tolerance'], + self.aggregator_config['time_tolerance_per_source']) + self.timeout_id = None # id of the 'tick' timeout that executes source cleanup + + def start_publishing(self): + """ + Called on startup. + Processes data from db and generates new timeouts for remaining + entries. + """ + self.set_timeout() + + def on_timeout(self): + """ + Callback called periodically after given timeout. + """ + LOGGER.debug('Tick passed') + for type_, event in self.db.generate_suppresed_events_after_timeout(): + if event is not None: + self.publish_event((type_, event)) + self.set_timeout() + + def process_event(self, data): + """ + Processes the event aggregation. + Each event also triggers additional suppressed events based + on time of the given source. 
+ """ + do_publish_new_message = self.db.process_new_message(data) + if do_publish_new_message: + self.publish_event(('event', data)) + for type_, event in self.db.generate_suppresed_events_for_source(data): + if event is not None: + self.publish_event((type_, event)) + + # XXX: can be removed after resolving ticket #6324 + @staticmethod + def _clean_count_related_stuff(cleaned_payload): + count_max = RecordDict.data_spec.count.max_value + count = cleaned_payload.get('count', 1) + if count > count_max: + cleaned_payload['count_actual'] = count + cleaned_payload['count'] = count_max + + def _get_cleaned_payload(self, type_, payload): + cleaned_payload = payload.copy() + cleaned_payload['type'] = type_ + cleaned_payload.pop('_group', None) + self._clean_count_related_stuff(cleaned_payload) + return cleaned_payload + + def publish_event(self, data): + """Publishes event to the output queue""" + type_, payload = data + if type_ is None: + return + cleaned_payload = self._get_cleaned_payload(type_, payload) + source, channel = cleaned_payload['source'].split('.') + rk = "{}.{}.{}.{}".format(type_, "aggregated", source, channel) + body = json.dumps(cleaned_payload) + self.publish_output(routing_key=rk, body=body) + + def set_timeout(self): + LOGGER.debug('Setting tick timeout') + self.timeout_id = self._connection.add_timeout(TICK_TIMEOUT, self.on_timeout) + + def input_callback(self, routing_key, body, properties): + record_dict = RecordDict.from_json(body) + with self.setting_error_event_info(record_dict): + data = dict(record_dict) ## FIXME?: maybe it could be just the record_dict? + if '_group' not in data: + raise n6QueueProcessingException("Hi-frequency source missing '_group' field.") + self.process_event(data) + + def stop(self): + self.db.store_state() + super(Aggregator, self).stop() + + +def main(): + with logging_configured(): + if os.environ.get('n6integration_test'): + # for debugging only + import logging + import sys + LOGGER.setLevel(logging.DEBUG) + LOGGER.addHandler(logging.StreamHandler(stream=sys.__stdout__)) + a = Aggregator() + try: + a.run() + except KeyboardInterrupt: + a.stop() + + +if __name__ == '__main__': + main() diff --git a/N6DataPipeline/n6datapipeline/base.py b/N6DataPipeline/n6datapipeline/base.py index 5d6f431..ad4bf34 100644 --- a/N6DataPipeline/n6datapipeline/base.py +++ b/N6DataPipeline/n6datapipeline/base.py @@ -13,6 +13,11 @@ import re import sys import time +from typing import ( + Optional, + SupportsBytes, + Union, +) import pika import pika.credentials @@ -26,11 +31,13 @@ from n6lib.argument_parser import N6ArgumentParser from n6lib.auth_api import AuthAPICommunicationError from n6lib.common_helpers import ( + as_bytes, ascii_str, exiting_on_exception, make_exc_ascii_str, ) from n6lib.log_helpers import get_logger +from n6lib.typing_helpers import KwargsDict LOGGER = get_logger(__name__) @@ -80,8 +87,8 @@ class LegacyQueuedBase(object): LegacyQueuedBase should handle unexpected interactions with RabbitMQ such as channel and connection closures. 
- Dev note: if child classes are defining __init__(), it should accept - **kwargs and call super(ChildClass, self).__init__(**kwargs) + Dev note: if a child class defines __init__(), it should accept + **kwargs and call super().__init__(**kwargs) """ # @@ -166,7 +173,7 @@ def __new__(cls, **kwargs): # some unit tests are over-zealous about patching super() from builtins import super - self = super(LegacyQueuedBase, cls).__new__(cls, **kwargs) + self = super().__new__(cls, **kwargs) if cls.input_queue is not None and not isinstance(self.input_queue, dict): raise TypeError('The `input_queue` class attribute must be a dict or None') @@ -345,7 +352,7 @@ def add_suffix_to_queue_conf(queue_conf, suffix): # Actual initialization def __init__(self, **kwargs): - super(LegacyQueuedBase, self).__init__(**kwargs) + super().__init__(**kwargs) LOGGER.debug('input_queue: %a', self.input_queue) LOGGER.debug('output_queue: %a', self.output_queue) @@ -1069,7 +1076,11 @@ def on_message(self, channel, basic_deliver, properties, body): `properties`: A pika.Spec.BasicProperties object. `body`: The message body. """ - exc_info = None + # Note: here we coerce `body` to bytes *just in case*; generally, + # that coercion should not be necessary, as we expect that `pika` + # passes in a `bytes` object. + body = as_bytes(body, encode_error_handling='strict') + delivery_tag = basic_deliver.delivery_tag routing_key = basic_deliver.routing_key if not self._is_output_ready_or_none(): @@ -1115,7 +1126,10 @@ def on_message(self, channel, basic_deliver, properties, body): else: self.acknowledge_message(delivery_tag) - def input_callback(self, routing_key, body, properties): + def input_callback(self, + routing_key: str, + body: bytes, + properties: pika.BasicProperties) -> None: """ Placeholder for input_callback defined by child classes. @@ -1242,7 +1256,11 @@ def start_publishing(self): are ready. Publishers should override this method. """ - def publish_output(self, routing_key, body, prop_kwargs=None, exchange=None): + def publish_output(self, + routing_key: str, + body: Union[str, bytes, bytearray, memoryview, SupportsBytes], + prop_kwargs: Optional[KwargsDict] = None, + exchange: Optional[str] = None): """ Publish to the (default or specified) output exchange. @@ -1250,7 +1268,12 @@ def publish_output(self, routing_key, body, prop_kwargs=None, exchange=None): `routing_key`: The routing key to send the message with. `body`: - The body of the message. + The body of the message, typically a `bytes`/`bytearray` + instance, which can contain arbitrary binary data + (it can also be a `str`, but then: (1) it will be + automatically encoded to `bytes` using UTF-8; (2) it + must *not* contain Unicode surrogate code points, or + `UnicodeEncodeError` will be raised). `prop_kwargs` (optional): Custom keyword arguments for pika.BasicProperties. `exchange` (optional): @@ -1258,6 +1281,8 @@ def publish_output(self, routing_key, body, prop_kwargs=None, exchange=None): the first item of the `output_queue` instance attribute will be used. """ + body = as_bytes(body, encode_error_handling='strict') + if self._closing: # CRITICAL because for a long time (since 2013-04-26!) there was a silent return here! 
LOGGER.critical('Trying to publish when the `_closing` flag is true!') @@ -1296,7 +1321,11 @@ def publish_output(self, routing_key, body, prop_kwargs=None, exchange=None): '(routing key: {2!a}, body length: {3})'.format( self._closing, self.output_ready, routing_key, len(body))) - def basic_publish(self, exchange, routing_key, body, properties): + def basic_publish(self, + exchange: str, + routing_key: str, + body: bytes, + properties: pika.BasicProperties): """ Thin wrapper around pika's basic_publish -- for easier testing/mocking. diff --git a/N6DataPipeline/n6datapipeline/data/conf/00_global.conf b/N6DataPipeline/n6datapipeline/data/conf/00_global.conf new file mode 100644 index 0000000..5de931d --- /dev/null +++ b/N6DataPipeline/n6datapipeline/data/conf/00_global.conf @@ -0,0 +1,16 @@ +[rabbitmq] +host=localhost +# `url` is a deprecated (and generally not used) legacy alias for `host` +url=%(host)s +port=5671 + +# if you want to use SSL, the `ssl` option must be set to 1 and the +# following options must be set to appropriate file paths: +ssl=0 +path_to_cert=~/cert +ssl_ca_certs=%(path_to_cert)s/testca/cacert.pem +ssl_certfile=%(path_to_cert)s/client/cert.pem +ssl_keyfile=%(path_to_cert)s/client/key.pem + +# AMQP heartbeat interval for most of the components +heartbeat_interval=30 diff --git a/N6DataPipeline/n6datapipeline/data/conf/09_auth_db.conf b/N6DataPipeline/n6datapipeline/data/conf/09_auth_db.conf new file mode 100644 index 0000000..c74d234 --- /dev/null +++ b/N6DataPipeline/n6datapipeline/data/conf/09_auth_db.conf @@ -0,0 +1,35 @@ +[auth_db] +## connection URL, e.g.: mysql+mysqldb://n6:somepassword@localhost/n6 +## it must start with `mysql+mysqldb:` (or just `mysql:`) because other +## dialects/drivers are not supported +## (see also: http://docs.sqlalchemy.org/en/rel_0_9/core/engines.html) +#url = mysql://user:password@host/dbname + +## if you want to use SSL, the following options must be set to +## appropriate file paths: +#ssl_cacert = /some/path/to/CACertificatesFile.pem +#ssl_cert = /some/path/to/ClientCertificateFile.pem +#ssl_key = /some/path/to/private/ClientCertificateKeyFile.pem + + +[auth_db_session_variables] + +## all MySQL variables specified within this section will be set by +## executing "SET SESSION = , ...". +## WARNING: for simplicity, the variable names and values are inserted +## into SQL code "as is", *without* any escaping (we assume we can treat +## configuration files as a *trusted* source of data). + +## (`[auth_db_session_variables].wait_timeout` should be +## greater than `[auth_db_connection_pool].pool_recycle`) +wait_timeout = 7200 + + +[auth_db_connection_pool] + +## (generally, the defaults should be OK in most cases; if you are +## interested in technical details -- see: SQLAlchemy docs...) 
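[Editorial note -- illustrative only, not part of the patch.] The connection-pool options that follow use the names of standard SQLAlchemy engine parameters; the code that consumes them is not part of this hunk, but conceptually they end up in a call roughly equivalent to the sketch below (the URL is a placeholder, like the commented example above):

    from sqlalchemy import create_engine

    engine = create_engine(
        'mysql://user:password@host/dbname',  # placeholder URL
        pool_recycle=3600,   # recycle connections after an hour (keep below wait_timeout above)
        pool_timeout=20,     # seconds to wait for a free connection from the pool
        pool_size=15,        # persistent connections kept in the pool
        max_overflow=12,     # extra connections allowed beyond pool_size under load
    )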
+pool_recycle = 3600
+pool_timeout = 20
+pool_size = 15
+max_overflow = 12
diff --git a/N6DataPipeline/n6datapipeline/data/conf/11_jinja_rendering.conf b/N6DataPipeline/n6datapipeline/data/conf/11_jinja_rendering.conf
new file mode 100644
index 0000000..27b0d17
--- /dev/null
+++ b/N6DataPipeline/n6datapipeline/data/conf/11_jinja_rendering.conf
@@ -0,0 +1,59 @@
+[jinja_template_based_renderer]
+############################################################################
+# This configuration section is needed only if the `from_predefined()`     #
+# constructor provided by `n6lib.jinja_helpers.JinjaTemplateBasedRenderer` #
+# is used (note: this is also the case when `MailNoticesAPI` from the      #
+# `n6lib.mail_notices_api` module and/or `MailMessageBuilder` from the     #
+# `n6lib.mail_sending_api` module are in use). Other constructors provided #
+# by `JinjaTemplateBasedRenderer` do not need any configuration at all.    #
+############################################################################
+
+# The value of the following option should consist of (one or
+# more) comma-separated template locations that will be tried,
+# in the specified order, by Jinja template loaders when
+# searching for templates.
+#
+# Each of these locations should be:
+#
+# * An *absolute* path of a directory (aka folder); if it makes
+#   use of a tilde-based home directory placeholder prefix, such
+#   as `~` or `~username`, the placeholder will be automatically
+#   expanded.
+#   Examples:
+#       /etc/n6/templates
+#       ~/my-own-n6-stuff/jinja-related
+#       ~dataman/.n6/our-custom-fancy-templates
+#
+# *OR*
+#
+# * A specification in the following format:
+#       @<package name>:<package subdir path>
+#   where:
+#   * <package name> is a Python package name
+#     (see also: the docs of the `jinja2.PackageLoader`'s
+#     parameter `package_name`);
+#   * <package subdir path> is a *relative* path of
+#     a directory (folder) in that package's source tree
+#     (see also: the docs of the `jinja2.PackageLoader`'s
+#     parameter `package_path`).
+#   Examples:
+#       @n6lib:data/templates
+#       @my.own.package:some-dir/sub-dir/sub-sub-dir
+template_locations = ~/.n6/templates, @n6datapipeline:data/templates, @n6lib:data/templates
+
+# The default value ("utf-8") of the following option should be
+# OK in nearly all cases.
+;template_encoding = utf-8
+
+# The following option is relevant *only* to template locations
+# specified as absolute paths of directories (*not* to those in
+# the `@<package name>:<package subdir path>` format).
+;follow_symlinks = False
+
+# The value of the following option should consist of (zero or
+# more) comma-separated *import names* of Jinja extensions (see:
+# https://jinja.palletsprojects.com/extensions/). Typically, it
+# should contain, at the minimum, the "jinja2.ext.do" name -- at
+# least, as long as any of the default templates (those bundled
+# with *n6*) are in use.
+;jinja_extensions = jinja2.ext.do diff --git a/N6DataPipeline/n6datapipeline/data/conf/11_mailing.conf b/N6DataPipeline/n6datapipeline/data/conf/11_mailing.conf new file mode 100644 index 0000000..9898769 --- /dev/null +++ b/N6DataPipeline/n6datapipeline/data/conf/11_mailing.conf @@ -0,0 +1,242 @@ +# Note: the *mail notices* feature engages the 3 configuration sections: +# +# * the `[mail_notices_api]` section (see below) +# -- directly related to `MailNoticesAPI` from `n6lib.mail_notices_api`, +# +# * the `[mail_sending_api]` section (see below) +# -- directly related to `MailSendingAPI` from `n6lib.mail_sending_api`, +# +# * the `[jinja_template_based_renderer]` section +# (see a separate file; typically it is `11_jinja_rendering.conf`) +# -- directly related to `JinjaTemplateBasedRenderer.from_predefined()` +# from `n6lib.jinja_helpers`. +# +# The `MailSendingAPI` and/or `JinjaTemplateBasedRenderer` tools, though +# somewhat lower-level ones, can also be used on their own (then only +# the section directly related to the particular tool is relevant). + + + + +[mail_notices_api] + +# Should mail notices be dispatched at all? If this option is +# false then any invocations of a dispatcher obtained from a +# context manager returned by the `MailNoticesAPI.dispatcher()` +# method do nothing, and *no* other options from this section or +# from the `[mail_sending_api]`/`[jinja_template_based_renderer]` +# sections (which normally are also engaged) are used by the +# `MailNoticesAPI` stuff. +active = false + +# The value of the following option, if not being empty, should +# be a Python dict literal representing a dict that maps *notice +# keys* (str, e.g.: 'org_config_update_requested') to dicts that +# map 2-character codes of a supported *language* (such as 'EN' +# or 'PL) to dicts specifying the following mail components: +# *body*, *subject*, *sender* and (optionally) *misc headers* +# (which stands for *miscellaneous mail headers*). +# +# Lack of a certain *notice key* means that the mail notices +# stuff is not active for that *notice key* (meaning that any +# invocations of a dispatcher obtained from a context manager +# returned by any `MailNoticesAPI.dispatcher()` +# call do nothing). +# +# Each of the *mail components* dicts (i.e., the dicts mentioned +# above as those specifying mail components) contains some or +# all of the following items: +# +# * 'body' -- a *string value* (required), +# +# * 'subject' -- a *string value* (required), +# +# * 'sender' -- a *string value* (required if the value of +# the `default_sender` option [see below] is left empty, +# otherwise optional), +# +# * 'misc_headers' -- a dict that maps any mail header names +# to their values, specified as *string values* (optional); +# +# **Important note:** each of the *string values* mentioned +# above shall be a string which is: +# +# * (1) **either** a Jinja template name preceded with a `$:` +# (*dollar sign* followed by *colon*) marker, +# +# * (2) **or** any other string -- which *literally* specifies +# the item's value (**without** any HTML/XML escaping!). 
+# +# Ad (1): those Jinja templates will be used by an instance of +# `JinjaTemplateBasedRenderer` (see `n6lib.jinja_helpers` and +# the `[jinja_template_based_renderer]` config section) as the +# basis for rendering of actual values -- with the *rendering +# context* containing the `data_dict` variable being a deep copy +# of the `notice_data` dict passed in to the dispatcher [where +# *dispatcher* is a callable object obtained as the `as` target +# (`__enter__()`'s return value) of a context manager returned +# by `MailNoticesAPI.dispatcher()`]. +# +# **Beware** that HTML/XML escaping is applied **only** if the +# template name has a `.html`, `.htm` or `.xml` suffix (checked +# in a case-insensitive manner). +# +# For example templates -- see the template files in the +# `data/templates` subdirectory of the `n6lib` package source +# tree. +# +# The default value of this option seems to be quite sensible +# for most important use cases. The basic versions of the +# Jinja templates it refers to are already defined in the +# `data/templates` subdirectory of the `n6lib` package; note: +# you can customize them by creating your own template files -- +# named the same but placed in (an)other location(s) (specified +# with the `template_locations` configuration option in the +# section `[jinja_template_based_renderer]`). +;notice_key_to_lang_to_mail_components = +; { +; 'mfa_config_done': { +; 'EN': { +; 'subject': +; 'New configuration of multi-factor authentication', +; 'body': '$:mail_notice__mfa_config_done__EN.txt', +; }, +; 'PL': { +; 'subject': +; u'Nowa konfiguracja uwierzytelniania wielosk\u0142adnikowego', +; 'body': '$:mail_notice__mfa_config_done__PL.txt', +; }, +; }, +; 'mfa_config_erased': { +; 'EN': { +; 'subject': +; 'Deleted configuration of multi-factor authentication', +; 'body': '$:mail_notice__mfa_config_erased__EN.txt', +; }, +; 'PL': { +; 'subject': +; u'Usuni\u0119ta konfiguracja uwierzytelniania wielosk\u0142adnikowego', +; 'body': '$:mail_notice__mfa_config_erased__PL.txt', +; }, +; }, +; +; 'new_org_and_user_created': { +; 'EN': { +; 'subject': +; 'Welcome to the n6 system', +; 'body': '$:mail_notice__new_org_and_user_created__EN.txt', +; }, +; 'PL': { +; 'subject': +; u'Witamy w systemie n6', +; 'body': '$:mail_notice__new_org_and_user_created__PL.txt', +; }, +; }, +; +; 'org_config_update_requested': { +; 'EN': { +; 'subject': +; 'A new request to update the organization configuration', +; 'body': '$:mail_notice__org_config_update_requested__EN.txt', +; }, +; 'PL': { +; 'subject': +; 'Nowa propozycja zmian w konfiguracji Twojej organizacji', +; 'body': '$:mail_notice__org_config_update_requested__PL.txt', +; }, +; }, +; 'org_config_update_applied': { +; 'EN': { +; 'subject': +; 'Acceptance of the requested update of the organization configuration', +; 'body': '$:mail_notice__org_config_update_applied__EN.txt', +; }, +; 'PL': { +; 'subject': +; 'Akceptacja zmian w konfiguracji Twojej organizacji', +; 'body': '$:mail_notice__org_config_update_applied__PL.txt', +; }, +; }, +; 'org_config_update_rejected': { +; 'EN': { +; 'subject': +; 'Rejection of the requested update of the organization configuration', +; 'body': '$:mail_notice__org_config_update_rejected__EN.txt', +; }, +; 'PL': { +; 'subject': +; 'Odmowa wprowadzenia zmian w konfiguracji Twojej organizacji', +; 'body': '$:mail_notice__org_config_update_rejected__PL.txt', +; }, +; }, +; +; 'password_reset_done': { +; 'EN': { +; 'subject': +; 'New log-in password', +; 'body': 
'$:mail_notice__password_reset_done__EN.txt', +; }, +; 'PL': { +; 'subject': +; u'Nowe has\u0142o logowania', +; 'body': '$:mail_notice__password_reset_done__PL.txt', +; }, +; }, +; 'password_reset_requested': { +; 'EN': { +; 'subject': +; 'Setting new log-in password', +; 'body': '$:mail_notice__password_reset_requested__EN.txt', +; }, +; 'PL': { +; 'subject': +; u'Ustawianie nowego has\u0142a logowania', +; 'body': '$:mail_notice__password_reset_requested__PL.txt', +; }, +; }, +; } + +# The following option specifies (using a 2-character string) +# the *default language* -- to be used when *neither* of the +# `MailNoticesAPI.dispatcher()` and `()` +# invocations has included the `lang` argument (specifying the +# desired mail notice language variant); but also when it has +# been included but its value is missing from the *notice key*- +# specific subdict of the `notice_key_to_lang_to_mail_components` +# dict (see its description above). +;default_lang = EN + +# The value of the following option, if not left empty, should +# be a text to be used as the default value of the 'sender' +# item of subdicts that define mail components (see the above +# description of the `notice_key_to_lang_to_mail_components` +# option; the remarks about `$:`-prepended *template names* +# and HTML/XML escaping apply also here). +default_sender = n6notices@example.org + +# The value of the following option, if not left empty, should +# be a Python dict literal that defines additional mail headers, +# to be used to complement (but never overwrite) the items of +# each 'misc_headers' dict (ad 'misc_headers` -- see the above +# description of the `notice_key_to_lang_to_mail_components` +# option; the remarks about `$:`-prepended *template names* and +# HTML/XML escaping apply also here). +;common_misc_headers = + + + + +[mail_sending_api] + +smtp_host=localhost +smtp_port=25 +;smtp_login= +;smtp_password= + + + + +# Note: if you make use of `n6lib.mail_notices_api.MailNoticesAPI` +# and/or `n6lib.mail_sending_api.MailMessageBuilder`, you must also pay +# attention to the `[jinja_template_based_renderer]` configuration +# section (typically, placed in the `11_jinja_rendering.conf` file). diff --git a/N6DataPipeline/n6datapipeline/data/templates/.gitkeep b/N6DataPipeline/n6datapipeline/data/templates/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/N6DataPipeline/n6datapipeline/enrich.py b/N6DataPipeline/n6datapipeline/enrich.py new file mode 100644 index 0000000..0b91d35 --- /dev/null +++ b/N6DataPipeline/n6datapipeline/enrich.py @@ -0,0 +1,298 @@ +# Copyright (c) 2013-2021 NASK. All rights reserved. + +import collections +import os +import urllib.parse + +import dns.resolver +# TODO: After migration to Pyton 3.x: remove the `iptools` dependency, +# adjusting our code to use std lib's `ipaddress` (maybe also +# adding IPv4/v6/both-dedicated config converters?), and/or our +# own existing IP-address-related helpers... 
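[Editorial note -- illustrative only, not part of the patch.] The TODO above proposes replacing the `iptools` dependency with the standard library's `ipaddress` module. A minimal sketch of what the excluded-IPs handling could look like after such a migration (the helper names below are hypothetical; the Enricher code in this patch still uses `iptools.IpRangeList`):

    import ipaddress

    def parse_excluded_networks(excluded_ips):
        # Accepts single addresses ("1.2.3.4") as well as CIDR ranges ("10.0.0.0/8").
        return [ipaddress.ip_network(value, strict=False) for value in excluded_ips]

    def is_excluded(ip, excluded_networks):
        # `ip` is a string; returns True if it falls into any of the excluded networks.
        address = ipaddress.ip_address(ip)
        return any(address in network for network in excluded_networks)

    # e.g. is_excluded('10.1.2.3', parse_excluded_networks(['10.0.0.0/8'])) -> True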
+import iptools +import maxminddb.const +from dns.exception import DNSException +from geoip2 import database, errors + +from n6datapipeline.base import LegacyQueuedBase +from n6lib.common_helpers import replace_segment, is_ipv4 +from n6lib.config import ConfigMixin +from n6lib.log_helpers import get_logger, logging_configured +from n6lib.record_dict import RecordDict + + +LOGGER = get_logger(__name__) + + +class Enricher(ConfigMixin, LegacyQueuedBase): + + input_queue = { + 'exchange': 'event', + 'exchange_type': 'topic', + 'queue_name': 'enrichement', + 'accepted_event_types': [ + 'event', + 'bl', + 'bl-update', + 'suppressed', + ], + } + output_queue = { + 'exchange': 'event', + 'exchange_type': 'topic', + } + + config_spec = """ + [enrich] + dnshost + dnsport :: int + geoippath = "" + asndatabasefilename = "" + citydatabasefilename = "" + excluded_ips = "" :: list_of_str + """ + + single_instance = False + + # + # Initialization + + def __init__(self, **kwargs): + self.is_geodb_enabled = False + self.gi_asn = None + self.gi_cc = None + self._resolver = None + self._enrich_config = self.get_config_section() + self.excluded_ips = self._get_excluded_ips() + self._setup_geodb() + self._setup_dnsresolver(self._enrich_config["dnshost"], self._enrich_config["dnsport"]) + super(Enricher, self).__init__(**kwargs) + + def _get_excluded_ips(self): + if self._enrich_config['excluded_ips']: + return iptools.IpRangeList(*self._enrich_config['excluded_ips']) + return None + + def _setup_dnsresolver(self, dnshost, dnsport): + self._resolver = dns.resolver.Resolver(configure=False) + self._resolver.nameservers = [dnshost] + self._resolver.port = dnsport + + def _setup_geodb(self): + geoipdb_path = self._enrich_config["geoippath"] + if geoipdb_path: + geoipdb_asn_file = self._enrich_config["asndatabasefilename"] + geoipdb_city_file = self._enrich_config["citydatabasefilename"] + if geoipdb_asn_file: + self.gi_asn = database.Reader(fileish=os.path.join(geoipdb_path, geoipdb_asn_file), + mode=maxminddb.const.MODE_MEMORY) + self.is_geodb_enabled = True + if geoipdb_city_file: + self.gi_cc = database.Reader(fileish=os.path.join(geoipdb_path, geoipdb_city_file), + mode=maxminddb.const.MODE_MEMORY) + self.is_geodb_enabled = True + + # + # Main activity + + def input_callback(self, routing_key, body, properties): + data = RecordDict.from_json(body) + with self.setting_error_event_info(data): + enriched = self.enrich(data) + rk = replace_segment(routing_key, 1, 'enriched') + body = enriched.get_ready_json() + self.publish_output(routing_key=rk, body=body) + + def enrich(self, data): + enriched_keys = [] + ip_to_enriched_address_keys = collections.defaultdict(list) + ip_from_url, fqdn_from_url = self._extract_ip_or_fqdn(data) + self._maybe_set_fqdn(fqdn_from_url, data, enriched_keys) + self._maybe_set_address_ips(ip_from_url, data, ip_to_enriched_address_keys) + if data.get('address'): + self._filter_out_excluded_ips(data, ip_to_enriched_address_keys) + self._maybe_set_other_address_data(data, ip_to_enriched_address_keys) + # NOTE: the `enriched` item of the record dict is set here to + # the pair (2-tuple) whose elements are: + # 0) a list of keys added by Enricher to the record dict + # (for now, the only such key is "fqdn"), + # 1) a dict whose keys are IP addresses (strings) and values + # are lists of address item keys added by Enricher for a + # particular IP ("asn", "cc", "ip") + # -- for example: + # (["fqdn"], {"127.0.0.1": ["ip"], "1.2.3.4": ["asn", "cc", "ip"]}) + data['enriched'] = (enriched_keys, 
ip_to_enriched_address_keys) + self._ensure_address_is_clean(data) + self._final_sanity_assertions(data) # <- can be commented out for efficiency + return data + + def _extract_ip_or_fqdn(self, data): + ip_from_url = fqdn_from_url = None + url = data.get('url') + if url is not None: + _fqdn_or_ip = self.url_to_fqdn_or_ip(url) + # ^ note: the returned _fqdn_or_ip *can* be an empty string + ## but it should not be None; added the following condition for debug + if _fqdn_or_ip is None: + LOGGER.error( + '_fqdn_or_ip is None, source: %a, url: %a', + data['source'], + url) + if is_ipv4(_fqdn_or_ip): + ip_from_url = _fqdn_or_ip + elif _fqdn_or_ip: + fqdn_from_url = _fqdn_or_ip + return ip_from_url, fqdn_from_url + + def _maybe_set_fqdn(self, fqdn_from_url, data, enriched_keys): + if data.get('fqdn') is None and fqdn_from_url: + data['fqdn'] = fqdn_from_url + enriched_keys.append('fqdn') + + def _maybe_set_address_ips(self, ip_from_url, data, ip_to_enriched_address_keys): + if not data.get('address'): + if data.get('fqdn') is None: + if ip_from_url: + data['address'] = [{'ip': ip_from_url}] + ip_to_enriched_address_keys[ip_from_url].append('ip') + elif not data.get('_do_not_resolve_fqdn_to_ip'): + _address = [] + for ip in self.fqdn_to_ip(data.get('fqdn')): + _address.append({'ip': ip}) + ip_to_enriched_address_keys[ip].append('ip') + if _address: + data['address'] = _address + + def _filter_out_excluded_ips(self, data, ip_to_enriched_address_keys): + assert 'address' in data + if self.excluded_ips: + _address = [] + for addr in data['address']: + ip = addr['ip'] + if ip in self.excluded_ips: + ip_to_enriched_address_keys.pop(ip, None) + else: + _address.append(addr) + data['address'] = _address + + def _maybe_set_other_address_data(self, data, ip_to_enriched_address_keys): + if self.is_geodb_enabled: + assert 'address' in data + for addr in data['address']: + # ASN + self._maybe_set_asn(addr, data, ip_to_enriched_address_keys) + # CC + self._maybe_set_cc(addr, data, ip_to_enriched_address_keys) + + def _maybe_set_asn(self, addr, data, ip_to_enriched_address_keys): + if self.gi_asn is not None: + ip = addr['ip'] + existing_asn = addr.pop('asn', None) + if existing_asn is not None: + LOGGER.warning( + 'it should not happen: event\'s `address` ' + 'contained an `asn` (%a) *before* enrichment ' + '-- so the `asn` has been dropped! ' + '[ip: %s; source: %a; event id: %a; rid: %a]', + existing_asn, + ip, + data['source'], + data['id'], + data['rid']) + asn = self.ip_to_asn(ip) + if asn: + addr['asn'] = asn + ip_to_enriched_address_keys[ip].append('asn') + + def _maybe_set_cc(self, addr, data, ip_to_enriched_address_keys): + if self.gi_cc is not None: + ip = addr['ip'] + existing_cc = addr.pop('cc', None) + if existing_cc is not None: + LOGGER.warning( + 'it should not happen: event\'s `address` ' + 'contained a `cc` (%a) *before* enrichment ' + '-- so the `cc` has been dropped! 
' + '[ip: %s; source: %a; event id: %a; rid: %a]', + existing_cc, + ip, + data['source'], + data['id'], + data['rid']) + cc = self.ip_to_cc(ip) + if cc: + addr['cc'] = cc + ip_to_enriched_address_keys[ip].append('cc') + + def _ensure_address_is_clean(self, data): + if data.get('address'): + # ensure that all content is normalized + # by RecordDict's `address` adjuster + data['address'] = data['address'] + else: + # ensure that no empty address is left + data.pop('address', None) + + def _final_sanity_assertions(self, data): + if __debug__: + enriched_keys, ip_to_enriched_address_keys = data['enriched'] + ip_to_addr = { + addr['ip']: addr + for addr in data.get('address', ())} + assert all( + name in data + for name in enriched_keys) + assert all( + set(addr_keys).issubset(ip_to_addr[ip]) + for ip, addr_keys in list(ip_to_enriched_address_keys.items())) + + # + # Resolution helpers + + def url_to_fqdn_or_ip(self, url): + parsed_url = urllib.parse.urlparse(url) + if parsed_url.netloc.endswith(':'): + # URL is probably wrong -- something like: "http://http://..." + return '' + return parsed_url.hostname + + def fqdn_to_ip(self, fqdn): + try: + dns_result = self._resolver.query(fqdn, 'A') + except DNSException: + return [] + ip_set = set() + for i in dns_result: + ip_set.add(str(i)) + return sorted(ip_set) + + def ip_to_asn(self, ip): + assert self.gi_asn is not None + try: + geoip_asn = self.gi_asn.asn(ip) + except errors.GeoIP2Error: + LOGGER.info("%a cannot be resolved by GeoIP (to ASN)", ip) + return None + return geoip_asn.autonomous_system_number + + def ip_to_cc(self, ip): + assert self.gi_cc is not None + try: + geoip_city = self.gi_cc.city(ip) + except errors.GeoIP2Error: + LOGGER.info("%a cannot be resolved by GeoIP (to CC)", ip) + return None + return geoip_city.country.iso_code + + +def main(): + with logging_configured(): + enricher = Enricher() + try: + enricher.run() + except KeyboardInterrupt: + enricher.stop() + + +if __name__ == "__main__": + main() diff --git a/N6DataPipeline/n6datapipeline/intelmq/helpers.py b/N6DataPipeline/n6datapipeline/intelmq/helpers.py index 862e698..b6f2820 100644 --- a/N6DataPipeline/n6datapipeline/intelmq/helpers.py +++ b/N6DataPipeline/n6datapipeline/intelmq/helpers.py @@ -6,6 +6,7 @@ from logging import getLogger from typing import Optional +import pika from intelmq.lib.bot import CollectorBot from intelmq.lib.message import MessageFactory @@ -178,7 +179,10 @@ def make_binding_keys(self, binding_states, accepted_event_types): super(QueuedBaseExtended, self).make_binding_keys(binding_states, accepted_event_types) - def input_callback(self, routing_key, body, properties): + def input_callback(self, + routing_key: str, + body: bytes, + properties: pika.BasicProperties) -> None: self.current_routing_key = routing_key input_msg = body.decode('utf-8') self.current_message = input_msg diff --git a/N6DataPipeline/n6datapipeline/intelmq/utils/intelmq_adapter.py b/N6DataPipeline/n6datapipeline/intelmq/utils/intelmq_adapter.py index 97dc844..2711b3a 100644 --- a/N6DataPipeline/n6datapipeline/intelmq/utils/intelmq_adapter.py +++ b/N6DataPipeline/n6datapipeline/intelmq/utils/intelmq_adapter.py @@ -9,6 +9,8 @@ from logging import getLogger from typing import Type +import pika + from n6datapipeline.base import LegacyQueuedBase from n6datapipeline.intelmq.utils.intelmq_converter import ( IntelToN6Converter, @@ -49,7 +51,10 @@ def set_queue_name(self): def get_component_group_and_id(self): return 'intelmq-utils', self.components_id - def input_callback(self, 
routing_key, body, properties): + def input_callback(self, + routing_key: str, + body: bytes, + properties: pika.BasicProperties) -> None: for converted in self.converter.convert(body.decode()): rk = replace_segment(routing_key, 1, self.components_id) self.publish_output(routing_key=rk, body=converted) diff --git a/N6DataPipeline/n6datapipeline/intelmq/utils/intelmq_converter.py b/N6DataPipeline/n6datapipeline/intelmq/utils/intelmq_converter.py index 9ef4d48..3de19ea 100644 --- a/N6DataPipeline/n6datapipeline/intelmq/utils/intelmq_converter.py +++ b/N6DataPipeline/n6datapipeline/intelmq/utils/intelmq_converter.py @@ -23,7 +23,7 @@ import bisect import json import logging -from collections import MutableMapping +from collections.abc import MutableMapping from copy import deepcopy diff --git a/N6DataPipeline/n6datapipeline/tests/test_aggregator.py b/N6DataPipeline/n6datapipeline/tests/test_aggregator.py new file mode 100644 index 0000000..6794bd6 --- /dev/null +++ b/N6DataPipeline/n6datapipeline/tests/test_aggregator.py @@ -0,0 +1,1866 @@ +# Copyright (c) 2013-2018 NASK. All rights reserved. + +import datetime +import json +import os +import tempfile +import unittest +from collections import namedtuple + +from unittest.mock import ( + MagicMock, + patch, +) +from unittest_expander import ( + expand, + foreach, + param, + paramseq, +) + +from n6datapipeline.base import n6QueueProcessingException +from n6datapipeline.aggregator import ( + Aggregator, + AggregatorData, + AggregatorDataWrapper, + HiFreqEventData, + SourceData, +) +from n6lib.unit_test_helpers import TestCaseMixin + + + +@expand +class TestAggregator(TestCaseMixin, unittest.TestCase): + + sample_routing_key = "testsource.testchannel" + sample_dbpath = "/tmp/sample_dbfile" + sample_time_tolerance = 600 + sample_time_tolerance_per_source = { + 'anothersource.andchannel': 1200, + } + starting_datetime = datetime.datetime(2017, 6, 1, 10) + mocked_utcnow = datetime.datetime(2017, 7, 1, 7, 0, 0) + input_callback_proper_msg = ( + '{' + '"source": "testsource.testchannel",' + '"_group": "group1",' + '"id": "d41d8cd98f00b204e9800998ecf8427b",' + '"time": "2017-06-01 10:00:00"' + '}' + ) + input_callback_msg_no__group = ( + '{' + '"source": "testsource.testchannel",' + '"id": "d41d8cd98f00b204e9800998ecf8427b",' + '"time": "2017-06-01 10:00:00"' + '}' + ) + mocked_config = { + "aggregator": { + "dbpath": sample_dbpath, + "time_tolerance": str(sample_time_tolerance), + "time_tolerance_per_source": json.dumps(sample_time_tolerance_per_source), + } + } + + + @paramseq + def _ordered_data_to_process(cls): + # Three events are published, each one of a different group. + yield param( + input_data=[ + { + "id": "c4ca4238a0b923820dcc509a6f75849b", + "source": "testsource.testchannel", + "_group": "group1", + "time": str(cls.starting_datetime), + }, + { + "id": "c81e728d9d4c2f636f067f89cc14862c", + "source": "testsource.testchannel", + "_group": "group2", + "time": str(cls.starting_datetime), + }, + { + "id": "eccbc87e4b5ce2fe28308fd9f2a7baf3", + "source": "testsource.testchannel", + "_group": "group3", + "time": str(cls.starting_datetime + datetime.timedelta(hours=1)), + }, + ], + expected_ids_to_single_events=[ + "c4ca4238a0b923820dcc509a6f75849b", + "c81e728d9d4c2f636f067f89cc14862c", + "eccbc87e4b5ce2fe28308fd9f2a7baf3" + ], + ) + + # First events of three groups are published; events + # of "group1" and "group2" are aggregated. 
The last + # event is published and it triggers publishing of + # aggregated events - its time difference exceeds + # the `AGGREGATE_WAIT` parameter. + yield param( + input_data=[ + { + "id": "c4ca4238a0b923820dcc509a6f75849b", + "source": "testsource.testchannel", + "_group": "group1", + "time": str(cls.starting_datetime), + }, + { + "id": "c81e728d9d4c2f636f067f89cc14862c", + "source": "testsource.testchannel", + "_group": "group2", + "time": str(cls.starting_datetime), + }, + { + "id": "d41d8cd98f00b204e9800998ecf8426f", + "source": "testsource.testchannel", + "_group": "group3", + "time": str(cls.starting_datetime), + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427a", + "source": "testsource.testchannel", + "_group": "group1", + "time": str(cls.starting_datetime + datetime.timedelta(hours=1)), + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427c", + "source": "testsource.testchannel", + "_group": "group2", + "time": str(cls.starting_datetime + datetime.timedelta(hours=1)), + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427b", + "source": "testsource.testchannel", + "_group": "group1", + "time": str(cls.starting_datetime + datetime.timedelta(hours=2)), + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427d", + "source": "testsource.testchannel", + "_group": "group1", + "time": str(cls.starting_datetime + datetime.timedelta(hours=14)), + }, + ], + expected_ids_to_single_events=[ + "c4ca4238a0b923820dcc509a6f75849b", + "c81e728d9d4c2f636f067f89cc14862c", + "d41d8cd98f00b204e9800998ecf8426f", + "d41d8cd98f00b204e9800998ecf8427d" + ], + expected_ids_to_suppressed_events={ + "c4ca4238a0b923820dcc509a6f75849b": { + '_first_time': str(cls.starting_datetime), + # the 'until' value is the time of the + # excluding the event that triggered + # publishing of aggregated events + "until": str(cls.starting_datetime + datetime.timedelta(hours=2)), + # the event that triggered publishing + # of aggregated events is not included + # in the count, it will be published + # with next group of aggregated events + "count": 3, + }, + "c81e728d9d4c2f636f067f89cc14862c": { + "_first_time": str(cls.starting_datetime), + "until": str(cls.starting_datetime + datetime.timedelta(hours=1)), + "count": 2, + }, + }, + ) + + # The latest event is 12 hours older than the last event + # of 'group2', but not than the last event of 'group1'. + # Suppressed events are published only for the 'group2'. + # There is only one event from the 'group3', so no suppressed + # events are published for it. 
+ yield param( + input_data=[ + { + "id": "1", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 08:00:00", + }, + { + "id": "2", + "source": "testsource.testchannel", + "_group": "group2", + "time": "2017-06-01 08:02:00", + }, + { + "id": "3", + "source": "testsource.testchannel", + "_group": "group3", + "time": "2017-06-01 08:04:00", + }, + { + "id": "4", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 09:00:00", + }, + { + "id": "5", + "source": "testsource.testchannel", + "_group": "group2", + "time": "2017-06-01 09:00:20", + }, + { + "id": "6", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 10:00:00", + }, + { + "id": "7", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 21:30:00", + }, + ], + expected_ids_to_single_events=[ + "1", + "2", + "3", + ], + expected_ids_to_suppressed_events={ + "2": { + "until": "2017-06-01 09:00:20", + "_first_time": "2017-06-01 08:02:00", + "count": 2, + } + } + ) + + yield param( + input_data=[ + { + "id": "d41d8cd98f00b204e9800998ecf8427b", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 01:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427c", + "source": "testsource.testchannel", + "_group": "group1", + "time": '2017-06-01 11:00:01', + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427e", + "source": "testsource.testchannel", + "_group": "group1", + "time": '2017-06-01 12:00:01', + }, + ], + expected_ids_to_single_events=[ + "d41d8cd98f00b204e9800998ecf8427b", + ], + ).label('The first event is published as "event", next two are aggregated.') + + yield param( + input_data=[ + { + "id": "d41d8cd98f00b204e9800998ecf8427b", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 01:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427c", + "source": "testsource.testchannel", + "_group": "group1", + "time": '2017-06-01 02:00:00', + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427e", + "source": "testsource.testchannel", + "_group": "group1", + "time": '2017-06-01 14:00:01', + }, + ], + expected_ids_to_single_events=[ + "d41d8cd98f00b204e9800998ecf8427b", + "d41d8cd98f00b204e9800998ecf8427e", + ], + expected_ids_to_suppressed_events={ + "d41d8cd98f00b204e9800998ecf8427b": { + "until": "2017-06-01 02:00:00", + "_first_time": "2017-06-01 01:00:00", + "count": 2, + }, + }, + ).label("Last event is 12 hours older than the previous one, it triggers publishing " + "of suppressed events.") + + # The latest 'group1' event is from the next day, so + # it triggers publishing of suppressed events for the 'group1'. + # There is only one 'group2' event, so there is no suppressed + # event for it. 
+ yield param( + input_data=[ + { + "id": "d41d8cd98f00b204e9800998ecf8427b", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 18:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427c", + "source": "testsource.testchannel", + "_group": "group2", + "time": '2017-06-01 19:00:00', + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427d", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 20:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427e", + "source": "testsource.testchannel", + "_group": "group1", + "time": '2017-06-02 01:00:02', + }, + ], + expected_ids_to_single_events=[ + "d41d8cd98f00b204e9800998ecf8427b", + "d41d8cd98f00b204e9800998ecf8427c", + "d41d8cd98f00b204e9800998ecf8427e", + ], + expected_ids_to_suppressed_events={ + "d41d8cd98f00b204e9800998ecf8427b": { + "until": "2017-06-01 20:00:00", + "_first_time": "2017-06-01 18:00:00", + "count": 2, + } + } + ) + + # The 'group2' latest event is from the next day, comparing + # to previous events, so it triggers publishing of suppressed + # events from the 'group1' and 'group2'. + yield param( + input_data=[ + { + "id": "d41d8cd98f00b204e9800998ecf8427b", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 18:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427c", + "source": "testsource.testchannel", + "_group": "group2", + "time": '2017-06-01 19:00:00', + }, + { + "id": "d41d8cd98f00b204e9800998ecf8425c", + "source": "testsource.testchannel", + "_group": "group2", + "time": '2017-06-01 19:28:00', + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427d", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 20:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427e", + "source": "testsource.testchannel", + "_group": "group2", + "time": '2017-06-02 01:00:02', + }, + ], + expected_ids_to_single_events=[ + "d41d8cd98f00b204e9800998ecf8427b", + "d41d8cd98f00b204e9800998ecf8427c", + "d41d8cd98f00b204e9800998ecf8427e", + ], + expected_ids_to_suppressed_events={ + "d41d8cd98f00b204e9800998ecf8427b": { + "until": "2017-06-01 20:00:00", + "_first_time": "2017-06-01 18:00:00", + "count": 2, + }, + "d41d8cd98f00b204e9800998ecf8427c": { + "until": "2017-06-01 19:28:00", + "_first_time": "2017-06-01 19:00:00", + "count": 2, + }, + } + ) + + # The third event is older than the current time, but it is + # newer than the first event from its group, so it is still + # aggregated. The last event triggers publishing of suppressed + # events. 
+ yield param( + input_data=[ + { + "id": "d41d8cd98f00b204e9800998ecf8427b", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 10:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427d", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 12:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8426d", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 11:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427e", + "source": "testsource.testchannel", + "_group": "group1", + "time": '2017-06-02 11:00:02', + }, + ], + expected_ids_to_single_events=[ + "d41d8cd98f00b204e9800998ecf8427b", + "d41d8cd98f00b204e9800998ecf8427e", + ], + expected_ids_to_suppressed_events={ + "d41d8cd98f00b204e9800998ecf8427b": { + "count": 3, + "_first_time": "2017-06-01 10:00:00", + "until": "2017-06-01 12:00:00", + }, + }, + expected_last_event_dt_updates=3, + ) + + # The second event is older than the current time, but it is + # within the time tolerance, so it is still aggregated. + yield param( + input_data=[ + { + "id": "d41d8cd98f00b204e9800998ecf8427b", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 10:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427d", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 09:51:00", # time within mocked time tolerance + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427e", + "source": "testsource.testchannel", + "_group": "group1", + "time": '2017-06-02 11:00:02', + }, + ], + expected_ids_to_single_events=[ + "d41d8cd98f00b204e9800998ecf8427b", + "d41d8cd98f00b204e9800998ecf8427e", + ], + expected_ids_to_suppressed_events={ + "d41d8cd98f00b204e9800998ecf8427b": { + "count": 2, + "until": "2017-06-01 10:00:00", + "_first_time": "2017-06-01 10:00:00", + }, + }, + ) + + # The second and fourth event is older than the current time, + # but fits the time tolerance for specific source, so it is + # still aggregated. + yield param( + input_data=[ + { + "id": "1", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 10:00:00", + }, + { + "id": "2", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 09:51:00", # within time tolerance + }, + { + "id": "3", + "source": "anothersource.andchannel", + "_group": "group1", + "time": '2017-06-01 11:00:00', + }, + { + "id": "4", + "source": "anothersource.andchannel", + "_group": "group1", + "time": '2017-06-01 10:40:00', # within time tolerance + }, + ], + expected_ids_to_single_events=[ + "1", + "3", + ], + ) + + # The newest event, which triggers publishing of suppressed + # events, has next day's date, but it also has to be + # greater than the time of a checked group's last + # event by more than the `time_tolerance`. Otherwise, + # further publishing of suppressed events is stopped. 
+ yield param( + input_data=[ + { + "id": "d41d8cd98f00b204e9800998ecf8427b", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 17:00:00", + }, + { + "id": "53b325261706c63aed655a3ca8810780", + "source": "testsource.testchannel", + "_group": "group2", + "time": '2017-06-01 18:00:00', + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427f", + "source": "testsource.testchannel", + "_group": "group2", + "time": '2017-06-01 19:00:00', + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427c", + "source": "testsource.testchannel", + "_group": "group1", + "time": '2017-06-01 23:57:00', + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427e", + "source": "testsource.testchannel", + "_group": "group1", + "time": '2017-06-02 00:03:01', + }, + ], + expected_ids_to_single_events=[ + "d41d8cd98f00b204e9800998ecf8427b", + "53b325261706c63aed655a3ca8810780", + "d41d8cd98f00b204e9800998ecf8427e", + ], + ) + + # The newest event has a time greater than the last + # "group1" event by more than the `time_tolerance`, + # so suppressed events are published for the group. + # Then, after checking "group2", the event does not + # meet this condition, so publishing of suppressed + # events is stopped here. No suppressed events + # are published for the "group2". + yield param( + input_data=[ + { + "id": "d41d8cd98f00b204e9800998ecf8427b", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 17:00:00", + }, + { + "id": "53b325261706c63aed655a3ca8810780", + "source": "testsource.testchannel", + "_group": "group2", + "time": '2017-06-01 18:00:00', + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427c", + "source": "testsource.testchannel", + "_group": "group1", + "time": '2017-06-01 19:00:00', + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427f", + "source": "testsource.testchannel", + "_group": "group2", + "time": '2017-06-01 23:57:00', + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427e", + "source": "testsource.testchannel", + "_group": "group1", + "time": '2017-06-02 00:03:01', + }, + ], + expected_ids_to_single_events=[ + "d41d8cd98f00b204e9800998ecf8427b", + "53b325261706c63aed655a3ca8810780", + "d41d8cd98f00b204e9800998ecf8427e", + ], + expected_ids_to_suppressed_events={ + "d41d8cd98f00b204e9800998ecf8427b": { + "_first_time": "2017-06-01 17:00:00", + "until": "2017-06-01 19:00:00", + "count": 2, + }, + }, + ) + + + @paramseq + def _unordered_data_to_process(cls): + # The first event is published, second and third are ignored, + # because they are older than the current time (event with + # the time tolerance added) and older than the first event + # of the group. Last event is aggregated. + yield param( + input_data=[ + { + "id": "d41d8cd98f00b204e9800998ecf8427b", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 10:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427d", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 08:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8426d", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 09:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427e", + "source": "testsource.testchannel", + "_group": "group1", + "time": '2017-06-01 11:00:02', + }, + ], + expected_ids_to_single_events=[ + "d41d8cd98f00b204e9800998ecf8427b", + ], + # Number of times the `SourceData` instance's `last_event` + # datetime is expected to be updated. It should not be + # updated, if the event is out of order. 
+ expected_last_event_dt_updates=2, + ) + + # The first event is published. The second is ignored, + # because it is older than the current time, and its group + # has not been registered before. The third one is older + # than the current time, and older than the first event + # of its group. The last one is aggregated. + yield param( + input_data=[ + { + "id": "d41d8cd98f00b204e9800998ecf8427b", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 10:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427d", + "source": "testsource.testchannel", + "_group": "group2", + "time": "2017-06-01 08:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8426d", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2017-06-01 09:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998ecf8427e", + "source": "testsource.testchannel", + "_group": "group1", + "time": '2017-06-01 11:00:02', + }, + ], + expected_ids_to_single_events=[ + "d41d8cd98f00b204e9800998ecf8427b", + ], + expected_last_event_dt_updates=2, + ) + + @paramseq + def _unordered_data_to_process_event__buffer_may_contain_suppressed_event_1(cls): + # The first, second, and third event are published. The last event is unique. + # The first has a new group. The second with the same group achieved aggregated time. + # The third event has a new group. The last event is older than the time of the source, + # but it fits in the tolerance range. There is not a high-frequency event of 'group1' + # in the groups dict, but it still remains in the buffer. Because of it, the event is + # neither being published nor aggregated, but the count attribute of related high-frequency + # event in the buffer is incremented. + yield param( + input_data=[ + { + "id": "d41d8cd98f00b204e9800998hg351", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2020-01-01 00:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998hg352", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2020-01-01 23:51:00", + }, + { + "id": "d41d8cd98f00b204e9800998hg353", + "source": "testsource.testchannel", + "_group": "group2", + "time": "2020-01-02 00:01:00", + }, + { + "id": "d41d8cd98f00b204e9800998hg354", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2020-01-02 00:00:00", + }, + ], + expected_ids_to_single_events=[ + "d41d8cd98f00b204e9800998hg351", + "d41d8cd98f00b204e9800998hg352", + "d41d8cd98f00b204e9800998hg353", + ], + expected_last_event_dt_updates=3, + ) + + @paramseq + def _unordered_data_to_process_event__buffer_may_contain_suppressed_event_2(cls): + # The first and third event are published. The second is aggregated. The last event is + # unique. The first has a new group. The second with the same group fits in the aggregated + # range time. The third event has a new group so published. + # The last event is older than the time of the source, but it fits in the tolerance range. + # There is not a high-frequency event of 'group1' in the groups dict, but it still remains + # in the buffer. Because of it, the event is neither being published nor aggregated, + # but the count attribute of related high-frequency event in the buffer is incremented. 
+ yield param( + input_data=[ + { + "id": "d41d8cd98f00b204e9800998hg351", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2020-01-01 22:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998hg352", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2020-01-01 23:51:00", + }, + { + "id": "d41d8cd98f00b204e9800998hg353", + "source": "testsource.testchannel", + "_group": "group2", + "time": "2020-01-02 00:01:00", + }, + { + "id": "d41d8cd98f00b204e9800998hg354", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2020-01-02 00:00:00", + }, + ], + expected_ids_to_single_events=[ + "d41d8cd98f00b204e9800998hg351", + "d41d8cd98f00b204e9800998hg353", + ], + expected_last_event_dt_updates=3, + ) + + @paramseq + def _unordered_data_to_process_event__buffer_may_contain_suppressed_event_3(cls): + # All events are published. The first has a new group. The second with the same group + # achieved aggregated time. The third event has a new group so published. + # The last event has a new group and is older than the time of the source, but it fits + # in the tolerance range. The difference between the case and other two similar + # cases is that it does not fulfill the condition, that a 'group1' hi-freq + # event still remains in the buffer - the buffer has been cleared, because + # the difference between the source time and 'until' time of the last event + # of 'group1' exceeds the tolerance range. So instead of suppressing the + # last 'group1' event and incrementing the hi-freq event's counter, + # the new event is being published. + + yield param( + input_data=[ + { + "id": "d41d8cd98f00b204e9800998hg351", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2020-01-01 00:00:00", + }, + { + "id": "d41d8cd98f00b204e9800998hg352", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2020-01-01 20:51:00", + }, + { + "id": "d41d8cd98f00b204e9800998hg353", + "source": "testsource.testchannel", + "_group": "group2", + "time": "2020-01-02 22:01:00", + }, + { + "id": "d41d8cd98f00b204e9800998hg354", + "source": "testsource.testchannel", + "_group": "group1", + "time": "2020-01-02 22:00:00", + }, + ], + expected_ids_to_single_events=[ + "d41d8cd98f00b204e9800998hg351", + "d41d8cd98f00b204e9800998hg352", + "d41d8cd98f00b204e9800998hg353", + "d41d8cd98f00b204e9800998hg354", + ], + expected_last_event_dt_updates=4, + ) + + def setUp(self): + self._published_events = [] + self._aggregator = Aggregator.__new__(Aggregator) + aggr_data_wrapper = AggregatorDataWrapper.__new__(AggregatorDataWrapper) + aggr_data_wrapper.aggr_data = AggregatorData() + aggr_data_wrapper.time_tolerance = self.sample_time_tolerance + aggr_data_wrapper.time_tolerance_per_source = self.sample_time_tolerance_per_source + self._mocked_datetime_counter = 0 + self._aggregator.db = aggr_data_wrapper + + + @foreach(_ordered_data_to_process + + _unordered_data_to_process_event__buffer_may_contain_suppressed_event_1 + + _unordered_data_to_process_event__buffer_may_contain_suppressed_event_2 + + _unordered_data_to_process_event__buffer_may_contain_suppressed_event_3) + def test_processing_events(self, + input_data, + expected_ids_to_single_events=None, + expected_ids_to_suppressed_events=None, + expected_last_event_dt_updates=None): + + if expected_last_event_dt_updates is None: + expected_last_event_dt_updates = len(input_data) + + self._test_process_event(input_data, + expected_ids_to_single_events, + expected_ids_to_suppressed_events, + 
expected_last_event_dt_updates) + + + @foreach(_unordered_data_to_process) + def test_processing_unordered_events(self, + input_data, + expected_ids_to_single_events=None, + expected_ids_to_suppressed_events=None, + expected_last_event_dt_updates=None): + + if expected_last_event_dt_updates is None: + expected_last_event_dt_updates = len(input_data) + + with self.assertRaisesRegex(n6QueueProcessingException, r"\bEvent out of order\b"): + self._test_process_event(input_data, + expected_ids_to_single_events, + expected_ids_to_suppressed_events, + expected_last_event_dt_updates) + + @foreach([ + param( + count=32767, + expected_body_content={ + "source": "ham.spam", + "type": "foobar", + "count": 32767, + }, + ).label("count not over limit"), + param( + count=32768, + expected_body_content={ + "source": "ham.spam", + "type": "foobar", + "count": 32767, + "count_actual": 32768, + }, + ).label("count over limit"), + ]) + def test_publish_event(self, count, expected_body_content): + type_ = "foobar" + payload = { + "source": "ham.spam", + "_group": "something", + "count": count, + } + data = type_, payload + expected_routing_key = "foobar.aggregated.ham.spam" + self._aggregator.publish_output = MagicMock() + + self._aggregator.publish_event(data) + + self.assertEqual(len(self._aggregator.publish_output.mock_calls), 1) + publish_output_kwargs = self._aggregator.publish_output.mock_calls[0][-1] + self.assertEqual(set(publish_output_kwargs.keys()), {"routing_key", "body"}) + self.assertEqual(publish_output_kwargs["routing_key"], expected_routing_key) + self.assertJsonEqual(publish_output_kwargs["body"], expected_body_content) + + + def test_input_callback(self): + with patch.object(Aggregator, "process_event") as process_event_mock: + self._aggregator.input_callback("testsource.testchannel", + self.input_callback_proper_msg, + self.sample_routing_key) + process_event_mock.assert_called_with(json.loads(self.input_callback_proper_msg)) + + + def test_input_callback_with__group_missing(self): + with self.assertRaisesRegex(n6QueueProcessingException, r"\bmissing '_group' field\b"): + with patch.object(Aggregator, "process_event"): + self._aggregator.input_callback("testsource.testchannel", + self.input_callback_msg_no__group, + self.sample_routing_key) + + @patch("n6datapipeline.base.LegacyQueuedBase.__init__", autospec=True) + @patch("n6lib.config.Config._load_n6_config_files", return_value=mocked_config) + def test_init_class(self, config_mock, init_mock): + with tempfile.NamedTemporaryFile() as fp: + config_mock.return_value["aggregator"]["dbpath"] = fp.name + self._aggregator.__init__() + + # store dir does not exist + with tempfile.NamedTemporaryFile() as fp, \ + self.assertRaisesRegex(Exception, r"store dir does not exist, stop aggregator"): + config_mock.return_value["aggregator"]["dbpath"] = os.path.join(fp.name, + "nonexistent_file") + self._aggregator.__init__() + + # store directory exists, but it has no rights to write + with tempfile.NamedTemporaryFile() as fp, \ + patch("os.access", return_value=None), \ + self.assertRaisesRegex(Exception, + r"stop aggregator, remember to set the rights for user, " + r"which runs aggregator"): + config_mock.return_value["aggregator"]["dbpath"] = fp.name + self._aggregator.__init__() + + + def _mocked_utcnow_method(self): + """ + Helper method used as a side effect of a mocked + datetime.datetime.utcnow() method. Increment the counter + during each call, which will indicate how many times + utcnow() was called. 
+ """ + self._mocked_datetime_counter += 1 + return self.mocked_utcnow + + def _test_process_event(self, + input_data, + expected_ids_to_single_events, + expected_ids_to_suppressed_events, + expected_last_event_dt_updates): + """ + Use input data to call Aggregator's `process_event()` method; + use it to create expected events and compare it with events + crated based on arguments that QueuedBase's `publish_output()` + method was called with (`publish_output()` normally, if not + mocked, would publish actual events created from + this arguments). + """ + expected_events = [] + + with patch("n6datapipeline.aggregator.datetime") as datetime_mock,\ + patch.object(Aggregator, "publish_output") as publish_output_mock: + datetime_mock.datetime.utcnow.side_effect = self._mocked_utcnow_method + datetime_mock.datetime.side_effect = (lambda *args, **kw: + datetime.datetime(*args, **kw)) + # a `SourceData` attribute `time_tolerance` needs + # a `datetime.timedelta` instance, but it is mocked now + datetime_mock.timedelta.side_effect = (lambda *args, **kw: + datetime.timedelta(*args, **kw)) + for event in input_data: + if expected_ids_to_single_events and event["id"] in expected_ids_to_single_events: + expected_events.append( + self._get_expected_event_from_input_data(event.copy(), "event")) + if (expected_ids_to_suppressed_events + and event["id"] in expected_ids_to_suppressed_events): + new_suppressed = event.copy() + new_suppressed.update(expected_ids_to_suppressed_events[event["id"]]) + expected_events.append( + self._get_expected_event_from_input_data(new_suppressed, "suppressed")) + self._aggregator.process_event(event) + events_from_calls = self._get_events_from_calls(publish_output_mock.call_args_list) + self.assertCountEqual(expected_events, events_from_calls) + # Check how many times datetime.datetime.utcnow() was called, + # meaning how many times the `SourceData` instance's + # `last_event` attribute was updated. It should not be updated + # when the event is out of order (we assume the source was not + # active if it published an old event). + self.assertEqual(self._mocked_datetime_counter, expected_last_event_dt_updates) + + + @staticmethod + def _get_expected_event_from_input_data(input_data, type_): + """ + Turn an input data to event-like dicts, that are expected + to be created during the calls to `process_event()` method. + Args: + `input_data`: + a dict with input data. + `type_`: + a type of event ('event' or 'suppressed'). + + Returns: + an event-like dict, that is expected to be created + during the call to `process_event()`. + """ + input_data.update({"type": type_}) + # final events do not contain field `_group` + del input_data["_group"] + return { + "body": input_data, + "routing_key": "{}.aggregated.{}".format(type_, input_data['source']), + } + + + @staticmethod + def _get_events_from_calls(call_args_list): + """ + Turn a list of calls to method to actual event-like + dicts, which would be created during a regular Aggregator + run. 
+ """ + events_from_calls = [] + for _, call_args in call_args_list: + events_from_calls.append({"body": json.loads(call_args["body"]), + "routing_key": call_args["routing_key"]}) + return events_from_calls + + +@expand +class TestAggregatorDataWrapper(unittest.TestCase): + + tested_source_channel = "testsource.testchannel" + other_source_channel = "othersource.otherchannel" + sample_db_path = "/tmp/example.pickle" + sample_time_tolerance = 600 + sample_time_tolerance_per_source = { + other_source_channel: 1200, + } + mocked_utcnow = datetime.datetime(2017, 7, 1, 12, 0, 0) + sources_tested_for_inactivity = [tested_source_channel, other_source_channel] + + group1_expected_suppressed_payload = dict( + count=5, + _first_time="2017-06-01 07:00:00", + id="c4ca4238a0b923820dcc509a6f75849b", + source=tested_source_channel, + time="2017-06-01 07:00:00", + _group="group1", + until="2017-06-01 09:00:00", + ) + group1_expected_suppressed_event = ( + "suppressed", + group1_expected_suppressed_payload, + ) + group2_expected_suppressed_payload = dict( + count=4, + _first_time="2017-06-01 08:00:00", + id="c4ca4238a0b923820dcc509a6f75849c", + source=tested_source_channel, + time="2017-06-01 08:00:00", + _group="group2", + until="2017-06-01 10:00:00", + ) + group2_expected_suppressed_event = ( + "suppressed", + group2_expected_suppressed_payload, + ) + group3_expected_suppressed_event = ( + "suppressed", + None, + ) + + # The namedtuple's fields are being used to describe expected + # `HiFreqEventData` class instances' - objects containing + # aggregated events, created during test calls to + # the `process_new_message()` method. + # `ExpectedHiFreqData` fields: + # 'name': an expected name of a key of + # the `AggregatorDataWrapper`.`groups` attribute, whose + # value should be the expected `HiFreqEventData` instance. + # 'until', 'first' and 'count': fields that explicitly + # correspond to `HiFreqEventData` instance's attributes. + # 'msg_index_to_payload': an index of element in the `messages` + # param, a dict, that is expected to be equal to a 'payload' + # attribute of the `HiFreqEventData` instance. + ExpectedHiFreqData = namedtuple( + "ExpectedHiFreqData", ("name", "until", "first", "count", "msg_index_to_payload")) + + + @paramseq + def _test_process_new_message_data(cls): + yield param( + messages=[ + { + "id": "c4ca4238a0b923820dcc509a6f75849b", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 10:00:00", + }, + ], + expected_source_time=datetime.datetime(2017, 6, 1, 10), + expected_groups=[ + cls.ExpectedHiFreqData( + name="group1", + until=datetime.datetime(2017, 6, 1, 10), + first=datetime.datetime(2017, 6, 1, 10), + count=1, + msg_index_to_payload=0, + ), + ], + ) + + # Second message fits to specific `time_tolerance` parameter + # for the source. 
+ yield param( + messages=[ + { + "id": "c4ca4238a0b923820dcc509a6f75849b", + "source": cls.other_source_channel, + "_group": "group1", + "time": "2017-06-01 10:00:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75850c", + "source": cls.other_source_channel, + "_group": "group1", + "time": "2017-06-01 09:40:00", + }, + ], + expected_source_time=datetime.datetime(2017, 6, 1, 10), + expected_groups=[ + cls.ExpectedHiFreqData( + name="group1", + until=datetime.datetime(2017, 6, 1, 10), + first=datetime.datetime(2017, 6, 1, 10), + count=2, + msg_index_to_payload=0, + ), + ], + ) + + yield param( + messages=[ + { + "id": "c4ca4238a0b923820dcc509a6f75849b", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 10:00:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75850c", + "source": cls.tested_source_channel, + "_group": "group2", + "time": "2017-06-01 12:00:00", + }, + ], + expected_source_time=datetime.datetime(2017, 6, 1, 12), + expected_groups=[ + cls.ExpectedHiFreqData( + name="group1", + until=datetime.datetime(2017, 6, 1, 10), + first=datetime.datetime(2017, 6, 1, 10), + count=1, + msg_index_to_payload=0, + ), + cls.ExpectedHiFreqData( + name="group2", + until=datetime.datetime(2017, 6, 1, 12), + first=datetime.datetime(2017, 6, 1, 12), + count=1, + msg_index_to_payload=1, + ), + ], + ) + + yield param( + messages=[ + { + "id": "c4ca4238a0b923820dcc509a6f75849b", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 10:00:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75850b", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 11:00:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75850c", + "source": cls.tested_source_channel, + "_group": "group2", + "time": "2017-06-01 12:00:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75851c", + "source": cls.tested_source_channel, + "_group": "group2", + "time": "2017-06-01 13:00:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75852b", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 14:00:00", + }, + ], + expected_source_time=datetime.datetime(2017, 6, 1, 14), + expected_groups=[ + cls.ExpectedHiFreqData( + name="group1", + until=datetime.datetime(2017, 6, 1, 14), + first=datetime.datetime(2017, 6, 1, 10), + count=3, + msg_index_to_payload=0, + ), + cls.ExpectedHiFreqData( + name="group2", + until=datetime.datetime(2017, 6, 1, 13), + first=datetime.datetime(2017, 6, 1, 12), + count=2, + msg_index_to_payload=2, + ), + ], + ) + + # Messages of the "group1" are aggregated until the message + # from next day comes in. It triggers publishing of aggregated + # messages, and a `HiFreqEventData` for "group1" events + # is replaced by the new instance. + # *Important*: aggregated messages of different groups + # from the same source would also be published in this + # situation, but it happens later, in the `Aggregator`'s + # instance `process_event()` method. 
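+ # [Editor's note: an illustrative sketch only -- not the actual
+ # `AggregatorDataWrapper` code. It shows the flush condition exercised by
+ # the param below: a message dated on a later calendar day than the
+ # group's `until` time causes the aggregated group to be published as
+ # "suppressed" and started anew. The helper name is an assumption.]
+ def _starts_new_aggregation_day_sketch(group_until, new_event_time):
+     # compare calendar days only; the 12-hour rule is a separate condition
+     return new_event_time.date() > group_until.date()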
+ yield param( + messages=[ + { + "id": "c4ca4238a0b923820dcc509a6f75849b", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 10:00:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75851b", + "source": cls.tested_source_channel, + "_group": "group2", + "time": "2017-06-01 10:15:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75751c", + "source": cls.tested_source_channel, + "_group": "group2", + "time": "2017-06-01 10:30:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75850b", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 11:00:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75850c", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 12:00:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75851c", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 13:00:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75852b", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-02 14:00:00", + }, + ], + expected_source_time=datetime.datetime(2017, 6, 2, 14), + expected_groups=[ + cls.ExpectedHiFreqData( + name="group1", + until=datetime.datetime(2017, 6, 2, 14), + first=datetime.datetime(2017, 6, 2, 14), + count=1, + msg_index_to_payload=6, + ), + cls.ExpectedHiFreqData( + name="group2", + until=datetime.datetime(2017, 6, 1, 10, 30), + first=datetime.datetime(2017, 6, 1, 10, 15), + count=2, + msg_index_to_payload=1, + ), + ], + expected_buffers=[ + cls.ExpectedHiFreqData( + name="group1", + until=datetime.datetime(2017, 6, 1, 13), + first=datetime.datetime(2017, 6, 1, 10), + count=4, + msg_index_to_payload=0, + ), + ], + ) + + # Messages of the "group1" are aggregated until the message + # newer by more than 12 hours (by default) is processed. + # It triggers publishing of aggregated + # messages, and a `HiFreqEventData` for "group1" events + # is replaced by the new instance. + # *Important*: aggregated messages of different groups + # from the same source would also be published in this + # situation, but it happens later, in the `Aggregator`'s + # instance `process_event()` method. 
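+ # [Editor's note: an illustrative sketch only, with an assumed helper
+ # name -- not the real implementation. It shows the other flush condition
+ # exercised by the param below: the new message is newer than the group's
+ # `until` time by more than the (default) 12-hour aggregation window.]
+ def _exceeds_aggregation_window_sketch(group_until, new_event_time,
+                                        window=datetime.timedelta(hours=12)):
+     # the 12-hour threshold reflects the documented default
+     return new_event_time - group_until > window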
+ yield param( + messages=[ + { + "id": "c4ca4238a0b923820dcc509a6f75849b", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 07:00:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75850b", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 08:00:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75751b", + "source": cls.tested_source_channel, + "_group": "group2", + "time": "2017-06-01 08:10:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75851b", + "source": cls.tested_source_channel, + "_group": "group2", + "time": "2017-06-01 08:30:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75850c", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 09:00:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75851c", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 10:00:00", + }, + { + "id": "c4ca4238a0b923820dcc509a6f75852b", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 22:00:01", + }, + ], + expected_source_time=datetime.datetime(2017, 6, 1, 22, 0, 1), + expected_groups=[ + cls.ExpectedHiFreqData( + name="group1", + until=datetime.datetime(2017, 6, 1, 22, 0, 1), + first=datetime.datetime(2017, 6, 1, 22, 0, 1), + count=1, + msg_index_to_payload=6, + ), + cls.ExpectedHiFreqData( + name="group2", + until=datetime.datetime(2017, 6, 1, 8, 30), + first=datetime.datetime(2017, 6, 1, 8, 10), + count=2, + msg_index_to_payload=2, + ), + ], + expected_buffers=[ + cls.ExpectedHiFreqData( + name="group1", + until=datetime.datetime(2017, 6, 1, 10), + first=datetime.datetime(2017, 6, 1, 7), + count=4, + msg_index_to_payload=0, + ), + ], + ) + + + @paramseq + def _test_generate_suppressed_events_for_source_data(cls): + # The newest message is from the next day comparing + # to previous events from "group1" and "group2", + # "suppressed" events for both groups should be + # generated. Data for the "group3" is None, + # because the group has only one event. + yield param( + new_message={ + "id": "c4ca4238a0b923820dcc509a6f75852b", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-02 10:00:01", + }, + expected_results=[ + cls.group1_expected_suppressed_event, + cls.group2_expected_suppressed_event, + cls.group3_expected_suppressed_event, + ], + ) + + # The newest message is more than 12 hours newer + # than the previous event of the "group1", but not + # the event of the "group2" - a "suppressed" event + # should be generated only of the "group1". Because + # of the "group2" not meeting the condition - checks + # of next groups are not performed. + yield param( + new_message={ + "id": "c4ca4238a0b923820dcc509a6f75852b", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 21:10:00", + }, + expected_results=[ + cls.group1_expected_suppressed_event, + ], + ) + + # The newest message is more than 12 hours newer + # than the previous event of both groups, "suppressed" + # events should be generated for "group1" and "group2". + # Data for the "group3" is None, because the group + # has only one event. 
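+ # [Editor's note: an illustrative sketch only -- not part of the tested
+ # code. It shows why the expected payload for "group3" is None: a
+ # "suppressed" event is expected only for a group that aggregated more
+ # than one message. The helper name and arguments are assumptions.]
+ def _expected_suppressed_payload_sketch(group_payload, count, first, until):
+     if count < 2:
+         # a single-event group yields no "suppressed" payload
+         return None
+     return dict(group_payload,
+                 count=count,
+                 _first_time=str(first),
+                 until=str(until))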
+ yield param( + new_message={ + "id": "c4ca4238a0b923820dcc509a6f75852b", + "source": cls.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 22:10:00", + }, + expected_results=[ + cls.group1_expected_suppressed_event, + cls.group2_expected_suppressed_event, + cls.group3_expected_suppressed_event, + ], + ) + + + @paramseq + def _test_generate_suppressed_events_after_timeout_data(cls): + # more than 24 hours has passed since processing of last + # event for the source "testsource.testchannel" + yield param( + mocked_utcnow=datetime.datetime(2017, 6, 2, 15), + expected_inactive_sources=[ + cls.tested_source_channel, + ], + ) + + # more than 24 hours has passed since processing of last + # event for both sources + yield param( + mocked_utcnow=datetime.datetime(2017, 6, 2, 20, 2), + expected_inactive_sources=[ + cls.tested_source_channel, + cls.other_source_channel, + ], + ) + + # more than 24 hours has not passed for any of tested sources + yield param( + mocked_utcnow=datetime.datetime(2017, 6, 2, 14), + expected_inactive_sources=[], + ) + + def setUp(self): + self._adw = AggregatorDataWrapper.__new__(AggregatorDataWrapper) + self._adw.time_tolerance = self.sample_time_tolerance + self._adw.time_tolerance_per_source = self.sample_time_tolerance_per_source + self._adw.dbpath = self.sample_db_path + self._adw.aggr_data = AggregatorData() + + def test_store_restore_state(self): + """ + Check validity of data stored in Pickle object and saved as temporary files + comparing its restored state. + """ + message = { + "id": "c4ca4238a0b923820dcc509a6f75852b", + "source": self.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 22:10:00", + } + + expected_stored_message = { + "id": "c4ca4238a0b923820dcc509a6f75852b", + "source": self.tested_source_channel, + "_group": "group1", + "time": "2017-06-01 22:10:00", + } + + self._adw.process_new_message(message) + with tempfile.NamedTemporaryFile() as fp: + self._adw.dbpath = fp.name + # store the state + self._adw.store_state() + # delete attribute with stored sources + del self._adw.aggr_data + # check restored state from existing file + self._adw.restore_state() + self.assertDictEqual( + self._adw.aggr_data.sources[self.tested_source_channel].groups[ + message["_group"]].payload, + expected_stored_message) + # assert given path exist + self.assertTrue(self._adw.dbpath) + # assert the exception is being raised when trying to store + # the state, but there is no access to the given path; first, + # make sure there actually is no access to the given path + tmp_db_path = "/root/example.pickle" + assert not os.access(tmp_db_path, os.W_OK), ('The test case relies on the assumption that ' + 'the user running the tests does not ' + 'have permission to write ' + 'to: {!a}'.format(tmp_db_path)) + self._adw.dbpath = tmp_db_path + with self.assertLogs(level='ERROR') as patched_logger: + self._adw.store_state() + self.assertEqual(patched_logger.output, ["ERROR:n6datapipeline.aggregator:Error saving " + "state to: '/root/example.pickle'"]) + # assert the exception is being raised when trying to restore + # the state from nonexistent file; first, safely create + # a temporary file, then close and remove it, so the path + # most likely does not exist + with tempfile.NamedTemporaryFile() as fp: + tmp_db_path = fp.name + assert not os.path.exists(tmp_db_path), ('The randomly generated temporary directory: ' + '{!a} still exists, so the test cannot ' + 'be correctly performed'.format(tmp_db_path)) + with patch.object(self._adw, "dbpath", 
tmp_db_path), \
+ self.assertRaisesRegex(IOError, r"No such file or directory"):
+ self._adw.restore_state()
+
+ @foreach(_test_process_new_message_data)
+ def test_process_new_message(self, messages, expected_source_time,
+ expected_groups, expected_buffers=None):
+ """
+ Check the validity of the data inside the tested source's
+ `groups` and `buffer` attributes after consecutive messages
+ have been processed.
+ """
+ test_sources = []
+ with patch("n6datapipeline.aggregator.datetime") as datetime_mock:
+ datetime_mock.datetime.utcnow.return_value = self.mocked_utcnow
+ datetime_mock.datetime.side_effect = (lambda *args, **kw:
+ datetime.datetime(*args, **kw))
+ # a `SourceData` attribute `time_tolerance` needs
+ # a `datetime.timedelta` instance, but it is mocked now
+ datetime_mock.timedelta.side_effect = (lambda *args, **kw:
+ datetime.timedelta(*args, **kw))
+
+ # actual calls
+ for msg in messages:
+ self._adw.process_new_message(msg)
+ if msg["source"] not in test_sources:
+ test_sources.append(msg["source"])
+
+ for test_source in test_sources:
+ # assertions for the source
+ created_source = self._adw.aggr_data.sources[test_source]
+ self.assertEqual(created_source.last_event, self.mocked_utcnow)
+ self.assertEqual(created_source.time, expected_source_time)
+ self.assertEqual(len(expected_groups), len(created_source.groups))
+
+ # assertions for groups
+ for expected_group in expected_groups:
+ self.assertIn(expected_group.name, created_source.groups)
+ created_group = created_source.groups[expected_group.name]
+ self.assertIsInstance(created_group, HiFreqEventData)
+ self.assertEqual(expected_group.until, created_group.until)
+ self.assertEqual(expected_group.first, created_group.first)
+ self.assertEqual(expected_group.count, created_group.count)
+ self.assertEqual(
+ messages[expected_group.msg_index_to_payload],
+ created_group.payload)
+ # assertions for potential buffers
+ if expected_buffers:
+ for expected_buffer in expected_buffers:
+ created_buffer = created_source.buffer[expected_buffer.name]
+ self.assertEqual(expected_buffer.until, created_buffer.until)
+ self.assertEqual(expected_buffer.first, created_buffer.first)
+ self.assertEqual(expected_buffer.count, created_buffer.count)
+ self.assertEqual(
+ messages[expected_buffer.msg_index_to_payload],
+ created_buffer.payload)
+
+
+ @foreach(_test_generate_suppressed_events_for_source_data)
+ def test_generate_suppressed_events_for_source(self, new_message, expected_results):
+ """
+ Check whether "suppressed" events are generated when a newly
+ processed message's time is newer than its group's `until`
+ time by more than the specified timeout (12 hours by default),
+ or when the message is from another day.
+ """ + tested_source_data = self._get_source_data_for_suppressed_events_tests( + self.tested_source_channel) + another_source_data = self._get_source_data_for_suppressed_events_tests( + self.other_source_channel) + hifreq_new_data = HiFreqEventData(new_message) + tested_source_data.groups["group1"] = hifreq_new_data + # `time` attribute should be equal to last message's + tested_source_data.time = datetime.datetime.strptime( + new_message["time"], "%Y-%m-%d %H:%M:%S") + another_source_data.time = datetime.datetime(2017, 6, 1, 10) + # `last_event` attribute is not relevant for the test + tested_source_data.last_event = datetime.datetime(2017, 6, 2, 20) + another_source_data.last_event = datetime.datetime(2017, 6, 2, 20) + self._adw.aggr_data.sources[self.tested_source_channel] = tested_source_data + self._adw.aggr_data.sources[self.other_source_channel] = another_source_data + + generated_events = list(self._adw.generate_suppresed_events_for_source(new_message)) + self.assertCountEqual(expected_results, generated_events) + # new `HiFreqEventData` object of the "group1" should be + # in `groups` attribute, but not in `buffer` - suppressed + # event of the "group1" should have been generated + self.assertIn( + "group1", self._adw.aggr_data.sources[self.tested_source_channel].groups) + self.assertNotIn( + "group1", self._adw.aggr_data.sources[self.tested_source_channel].buffer) + # if aggregated events of the "group2" were generated, then + # there should not be any `HiFreqEventData` objects of this + # group in `groups` nor `buffer` attribute + if self.group2_expected_suppressed_event in expected_results: + self.assertNotIn( + "group2", self._adw.aggr_data.sources[self.tested_source_channel].groups) + self.assertNotIn( + "group2", self._adw.aggr_data.sources[self.tested_source_channel].buffer) + + # check if the other source's elements, for which suppressed + # events were not generated, are unchanged + self.assertIn( + "group2", self._adw.aggr_data.sources[self.other_source_channel].groups) + self.assertIn( + "group1", self._adw.aggr_data.sources[self.other_source_channel].buffer) + + + @foreach(_test_generate_suppressed_events_after_timeout_data) + def test_generate_suppressed_events_after_timeout(self, + mocked_utcnow, + expected_inactive_sources): + """ + Test, whether sources are treated as inactive after specified + timeout, and if proper suppressed events are generated + for them. 
+ """ + tested_source_data = self._get_source_data_for_suppressed_events_tests( + self.tested_source_channel) + another_source_data = self._get_source_data_for_suppressed_events_tests( + self.other_source_channel) + # `time` attribute should be equal to last message's + tested_source_data.time = datetime.datetime(2017, 6, 1, 10) + another_source_data.time = datetime.datetime(2017, 6, 1, 10) + tested_source_data.last_event = datetime.datetime(2017, 6, 1, 14) + another_source_data.last_event = datetime.datetime(2017, 6, 1, 20) + self._adw.aggr_data.sources[self.tested_source_channel] = tested_source_data + self._adw.aggr_data.sources[self.other_source_channel] = another_source_data + + source_to_expected_events = self._get_source_to_expected_events_mapping() + + with patch("n6datapipeline.aggregator.datetime") as datetime_mock: + datetime_mock.datetime.utcnow.return_value = mocked_utcnow + datetime_mock.datetime.side_effect = (lambda *args, **kw: + datetime.datetime(*args, **kw)) + # a `SourceData` attribute `time_tolerance` needs + # a `datetime.timedelta` instance, but it is mocked now + datetime_mock.timedelta.side_effect = (lambda *args, **kw: + datetime.timedelta(*args, **kw)) + # actual call + generated_events = list(self._adw.generate_suppresed_events_after_timeout()) + expected_events = [event for source, vals in source_to_expected_events.items() + if source in expected_inactive_sources for event in vals] + self.assertEqual(expected_events, generated_events) + + for source in self.sources_tested_for_inactivity: + # check if `groups` and `buffers` were cleared + # for inactive sources + if source in expected_inactive_sources: + self.assertFalse(self._adw.aggr_data.sources[source].groups) + self.assertFalse(self._adw.aggr_data.sources[source].buffer) + # make sure `groups` and `buffers` were intact + # for still active sources + else: + self.assertTrue(self._adw.aggr_data.sources[source].groups) + self.assertTrue(self._adw.aggr_data.sources[source].buffer) + + + # helper methods + def _get_source_data_for_suppressed_events_tests(self, source_name): + source_data = SourceData(self._get_time_tolerance_from_source(source_name)) + + group1_hifreq_buffered_data = HiFreqEventData.__new__(HiFreqEventData) + group1_hifreq_buffered_data.payload = { + "id": "c4ca4238a0b923820dcc509a6f75849b", + "source": source_name, + "_group": "group1", + "time": "2017-06-01 07:00:00", + } + group1_hifreq_buffered_data.first = datetime.datetime(2017, 6, 1, 7) + group1_hifreq_buffered_data.until = datetime.datetime(2017, 6, 1, 9) + group1_hifreq_buffered_data.count = 5 + source_data.buffer["group1"] = group1_hifreq_buffered_data + + group2_hifreq_data = HiFreqEventData.__new__(HiFreqEventData) + group2_hifreq_data.payload = { + "id": "c4ca4238a0b923820dcc509a6f75849c", + "source": source_name, + "_group": "group2", + "time": "2017-06-01 08:00:00", + } + group2_hifreq_data.until = datetime.datetime(2017, 6, 1, 10) + group2_hifreq_data.first = datetime.datetime(2017, 6, 1, 8) + group2_hifreq_data.count = 4 + source_data.groups["group2"] = group2_hifreq_data + + group3_payload = { + "id": "c4ca4238a0b923820dcc509a6f75849d", + "source": source_name, + "_group": "group3", + "time": "2017-06-01 07:30:00", + } + group3_hifreq_data = HiFreqEventData(group3_payload) + source_data.groups["group3"] = group3_hifreq_data + + return source_data + + def _get_source_to_expected_events_mapping(self): + group1_other_source_payload = self.group1_expected_suppressed_payload.copy() + group1_other_source_payload["source"] = 
self.other_source_channel + group1_other_source_event = ("suppressed", group1_other_source_payload) + group2_other_source_payload = self.group2_expected_suppressed_payload.copy() + group2_other_source_payload["source"] = self.other_source_channel + group2_other_source_event = ("suppressed", group2_other_source_payload) + group3_other_source_event = self.group3_expected_suppressed_event + return { + self.tested_source_channel: [ + self.group1_expected_suppressed_event, + self.group2_expected_suppressed_event, + self.group3_expected_suppressed_event, + ], + self.other_source_channel: [ + group1_other_source_event, + group2_other_source_event, + group3_other_source_event, + ], + } + + def _get_time_tolerance_from_source(self, source): + return self.sample_time_tolerance_per_source.get(source) or self.sample_time_tolerance + + +class TestAggregatorData(unittest.TestCase): + + sample_source = "testsource.testchannel" + sample_other_source = "othersource.otherchannel" + sample_group = "group1" + sample_other_group = "group2" + sample_time_tolerance = 500 + sample_time_tolerance_per_source = { + sample_other_source: 1000, + } + + groups_hifreq_data = HiFreqEventData( + { + "id": "c4ca4238a0b923820dcc509a6f75849c", + "source": sample_source, + "_group": sample_group, + "time": "2017-06-02 12:00:00", + } + ) + buffer_hifreq_data = HiFreqEventData( + { + "id": "c4ca4238a0b923820dcc509a6f75849b", + "source": sample_source, + "_group": sample_group, + "time": "2017-06-01 10:00:00", + } + ) + + def setUp(self): + self._aggregator_data = AggregatorData() + self._sample_source_data = SourceData(self.sample_time_tolerance) + self._sample_source_data.time = datetime.datetime(2017, 6, 2, 12) + self._sample_source_data.last_event = datetime.datetime(2017, 6, 2, 13) + self._sample_source_data.groups[self.sample_group] = self.groups_hifreq_data + self._sample_source_data.buffer[self.sample_group] = self.buffer_hifreq_data + self._aggregator_data.sources[self.sample_source] = self._sample_source_data + + def test_create_new_source_data(self): + source_data = self._aggregator_data.get_or_create_sourcedata( + { + "id": "c4ca4238a0b923820dcc509a6f75851d", + "source": self.sample_other_source, + "_group": self.sample_group, + "time": "2017-05-01 12:00:00", + }, + self._get_time_tolerance_from_source(self.sample_other_source)) + self.assertIsInstance(source_data, SourceData) + self.assertEqual(source_data.time, None) + self.assertEqual(source_data.last_event, None) + self.assertFalse(source_data.groups) + self.assertFalse(source_data.buffer) + self.assertEqual( + source_data.time_tolerance, + datetime.timedelta(seconds=self._get_time_tolerance_from_source( + self.sample_other_source))) + self.assertIs(source_data, self._aggregator_data.sources[self.sample_other_source]) + + def test_get_existing_source_data(self): + source_data = self._aggregator_data.get_or_create_sourcedata( + { + "id": "c4ca4238a0b923820dcc509a6f75860f", + "source": self.sample_source, + "_group": self.sample_other_group, + "time": "2017-05-01 12:00:00", + }, + self._get_time_tolerance_from_source(self.sample_other_source)) + self.assertIsInstance(source_data, SourceData) + self.assertEqual(source_data.time, self._sample_source_data.time) + self.assertEqual(source_data.last_event, self._sample_source_data.last_event) + self.assertEqual( + source_data.time_tolerance, + datetime.timedelta(seconds=self._get_time_tolerance_from_source(self.sample_source))) + self.assertIn(self.sample_group, source_data.groups) + self.assertIn(self.sample_group, 
source_data.buffer) + self.assertEqual(1, len(source_data.groups)) + self.assertEqual(1, len(source_data.buffer)) + self.assertEqual(self.groups_hifreq_data, source_data.groups[self.sample_group]) + self.assertEqual(self.buffer_hifreq_data, source_data.buffer[self.sample_group]) + self.assertIs(source_data, self._aggregator_data.sources[self.sample_source]) + + def _get_time_tolerance_from_source(self, source): + return self.sample_time_tolerance_per_source.get(source) or self.sample_time_tolerance diff --git a/N6DataPipeline/n6datapipeline/tests/test_enrich.py b/N6DataPipeline/n6datapipeline/tests/test_enrich.py new file mode 100644 index 0000000..fa6f799 --- /dev/null +++ b/N6DataPipeline/n6datapipeline/tests/test_enrich.py @@ -0,0 +1,1377 @@ +# Copyright (c) 2013-2021 NASK. All rights reserved. + +import datetime +import hashlib +import os +import unittest +import unittest.mock + +import iptools +from geoip2.errors import GeoIP2Error +from dns.exception import DNSException + +from n6datapipeline.enrich import Enricher +from n6lib.record_dict import RecordDict +from n6lib.unit_test_helpers import TestCaseMixin + + +DEFAULT_GEO_IP_DB_PATH = '/usr/share/GeoIP' +DEFAULT_ASN_DB_FILENAME = 'GeoLite2-ASN.mmdb' +DEFAULT_CC_DB_FILENAME = 'GeoLite2-City.mmdb' + + +class MockASNReader(unittest.mock.Mock): + + asn = unittest.mock.Mock(return_value=unittest.mock.MagicMock(autonomous_system_number="1234")) + + +class MockCCReader(unittest.mock.Mock): + + city = unittest.mock.Mock(return_value=unittest.mock.MagicMock(country=unittest.mock.MagicMock(iso_code="PL"))) + + +class MockReader(object): + + def __new__(cls, fileish, *args, **kwargs): + filename = os.path.basename(fileish) + if filename == DEFAULT_ASN_DB_FILENAME: + return MockASNReader() + elif filename == DEFAULT_CC_DB_FILENAME: + return MockCCReader() + raise ValueError('Unrecognized name of GeoIP database file: {!a}. ' + 'Should be one of: {!a}, {!a}'.format(filename, + DEFAULT_ASN_DB_FILENAME, + DEFAULT_CC_DB_FILENAME)) + + +class _BaseTestEnricher(TestCaseMixin): + + """ + The class defines methods returning input test data. + Concrete classes should extend these methods, by taking + returned data and building assertions against expected + data. + + These tests should be common for cases testing the Enricher + with all the GeoIP databases, as well as only ASN, only CC + or none of them enabled. + """ + + COMMON_DATA = { + "category": "other", + "confidence": "low", + "restriction": "public", + "source": "test.test", + "time": str(datetime.datetime.now()), + "id": hashlib.md5("test".encode(encoding='utf-8')).hexdigest(), + "rid": hashlib.md5("test".encode(encoding='utf-8')).hexdigest(), + } + MOCK_CONFIG = NotImplemented + + @unittest.mock.patch('n6datapipeline.base.LegacyQueuedBase.get_connection_params_dict') + @unittest.mock.patch('n6datapipeline.enrich.database.Reader', MockReader) + @unittest.mock.patch('n6datapipeline.enrich.ConfigMixin.get_config_section') + def setUp(self, config_mock, *args): + config_mock.return_value = self.MOCK_CONFIG + Enricher._setup_dnsresolver = unittest.mock.MagicMock() + self.enricher = Enricher() + self.enricher._resolver = unittest.mock.MagicMock() + self.enricher._resolver.query = unittest.mock.MagicMock(return_value=["127.0.0.1"]) + + def test__ip_to_asn__called_or_not(self): + """ + Prepare for a test, whether the `ip_to_asn()` method was + called for all IP addresses, or not. 
+ """ + self.enricher.ip_to_asn = unittest.mock.MagicMock(return_value="") + data = self._make_actions_to_call_geoip_method_get_data() + return data + + def test__ip_to_cc__called_or_not(self): + """ + Prepare for a test, whether the `ip_to_cc()` method was + called for all IP addresses, or not. + """ + self.enricher.ip_to_cc = unittest.mock.MagicMock(return_value="") + data = self._make_actions_to_call_geoip_method_get_data() + return data + + def test__enrich__with_fqdn_given(self): + data = self.enricher.enrich(RecordDict({"fqdn": "cert.pl"})) + self.enricher._resolver.query.assert_called_once_with("cert.pl", "A") + return data + + def test__enrich__with_fqdn_given__resolved_to_various_ips_with_duplicates(self): + self.enricher._resolver.query.return_value = [ + '2.2.2.2', + '127.0.0.1', + '13.1.2.3', + '1.1.1.1', + '127.0.0.1', # duplicate + '13.1.2.3', # duplicate + '12.11.10.9', + '13.1.2.3', # duplicate + '1.0.1.1', + ] + data = self.enricher.enrich(RecordDict({"fqdn": "cert.pl"})) + self.enricher._resolver.query.assert_called_once_with("cert.pl", "A") + return data + + def test__enrich__with_url_given(self): + data = self.enricher.enrich(RecordDict({"url": "http://www.nask.pl/asd"})) + self.enricher._resolver.query.assert_called_once_with("www.nask.pl", "A") + return data + + def test__enrich__with_ip_url_given(self): + return self.enricher.enrich(RecordDict({"url": "http://192.168.0.1/asd"})) + + def test__enrich__with_ip_url_given__with_nodns_flag(self): + return self.enricher.enrich(RecordDict({ + "url": "http://192.168.0.1/asd", + "_do_not_resolve_fqdn_to_ip": True})) + + def test__enrich__with_fqdn_and_url_given(self): + data = self.enricher.enrich(RecordDict({"fqdn": "cert.pl", + "url": "http://www.nask.pl/asd"})) + self.enricher._resolver.query.assert_called_once_with("cert.pl", "A") + return data + + def test__enrich__with_fqdn_and_ip_url_given(self): + data = self.enricher.enrich(RecordDict({ + "fqdn": "cert.pl", + "url": "http://192.168.0.1/asd"})) + self.enricher._resolver.query.assert_called_once_with("cert.pl", "A") + return data + + def test__enrich__with_address_and_fqdn_given(self): + return self.enricher.enrich(RecordDict({ + "fqdn": "cert.pl", + "address": [{"ip": "10.20.30.40"}]})) + + def test__enrich__with_address_and_fqdn_given__with_nodns_flag(self): + return self.enricher.enrich(RecordDict({ + "fqdn": "cert.pl", + "address": [{"ip": "10.20.30.40"}], + "_do_not_resolve_fqdn_to_ip": True})) + + def test__enrich__with_address_and_url_given(self): + return self.enricher.enrich(RecordDict({ + "url": "http://www.nask.pl/asd", + "address": [{"ip": "10.20.30.40"}]})) + + def test__enrich__with_address_and_ip_url_given(self): + return self.enricher.enrich(RecordDict({ + "url": "http://192.168.0.3/asd", + "address": [{"ip": "10.20.30.40"}]})) + + def test__enrich__with_address_and_fqdn_and_url_given(self): + return self.enricher.enrich(RecordDict({ + "fqdn": "cert.pl", + "url": "http://www.nask.pl/asd", + "address": [{"ip": "10.20.30.40"}]})) + + def test__enrich__with_address_and_fqdn_and_ip_url_given(self): + return self.enricher.enrich(RecordDict({ + "fqdn": "cert.pl", + "url": "http://192.168.0.1/asd", + "address": [{"ip": "10.20.30.40"}]})) + + def test__enrich__with_excluded_ips_config__without_any_ip_to_exclude(self): + self._prepare_config_for_excluded_ips(['2.2.2.2', '3.3.3.3']) + self.enricher.excluded_ips = self.enricher._get_excluded_ips() + data = self.enricher.enrich(RecordDict({"url": "http://www.nask.pl/asd"})) + 
self.enricher._resolver.query.assert_called_once_with("www.nask.pl", "A") + return data + + # helper methods + def _prepare_config_for_excluded_ips(self, list_of_ips): + self.enricher._enrich_config = {'excluded_ips': list_of_ips} + + @staticmethod + def _get_actual_data_for_adding_asn_cc_if_possible(): + return RecordDict({ + "address": [{"ip": "127.0.0.1"}, + {"ip": "192.187.0.1"}, + {"ip": "10.15.1.255"}]}) + + @staticmethod + def _get_actual_data_for_existing_asn_cc_always_dropped_and_new_ones_added_if_possible(): + return RecordDict({ + "address": [{"ip": "127.0.0.1", "cc": "JP"}, + {"ip": "192.187.0.1", "cc": "US", "asn": 424242}, + {"ip": "10.15.1.255", "asn": 434343}]}) + + def _enricher_execution_helper(self, data_init, expected_num_of_warnings=None): + data = data_init + data.update(self.COMMON_DATA) + self.enricher.enrich(data) + self.expected_num_of_warnings = expected_num_of_warnings + return data + + def _set_asn_db_return_value_if_enabled(self, returned_asn): + if self.enricher.gi_asn is not None: + self.assertTrue(self.enricher.is_geodb_enabled) + self.enricher.gi_asn = unittest.mock.Mock() + self.enricher.gi_asn.asn = unittest.mock.Mock( + return_value=unittest.mock.MagicMock(autonomous_system_number=returned_asn)) + + def _set_asn_db_side_effect_if_enabled(self, side_effect): + if self.enricher.gi_asn is not None: + self.assertTrue(self.enricher.is_geodb_enabled) + self.enricher.gi_asn = unittest.mock.Mock() + self.enricher.gi_asn.asn = unittest.mock.MagicMock(side_effect=side_effect) + + def _set_cc_db_return_value_if_enabled(self, returned_cc): + if self.enricher.gi_cc is not None: + self.assertTrue(self.enricher.is_geodb_enabled) + self.enricher.gi_cc = unittest.mock.Mock() + self.enricher.gi_cc.city = unittest.mock.Mock( + return_value=unittest.mock.Mock(country=unittest.mock.Mock(iso_code=returned_cc))) + + def _set_cc_db_side_effect_if_enabled(self, side_effect): + if self.enricher.gi_cc is not None: + self.assertTrue(self.enricher.is_geodb_enabled) + self.enricher.gi_cc = unittest.mock.Mock() + self.enricher.gi_cc.city = unittest.mock.MagicMock(side_effect=side_effect) + + def _make_actions_to_call_geoip_method_get_data(self): + data = RecordDict({ + "address": [{"ip": "127.0.0.1"}, + {"ip": "192.187.0.1"}, + {"ip": "10.15.1.255"}]}) + data.update(self.COMMON_DATA) + self.enricher.enrich(data) + return data + + def _assert_geoip_method_called(self, meth, data): + for addr in data["address"]: + meth.assert_any_call(addr["ip"]) + self.assertEqual(len(data["address"]), meth.call_count) + + def _assert_geoip_method_not_called(self, meth): + self.assertFalse(meth.called) + + +class TestEnricherWithFullConfig(_BaseTestEnricher, unittest.TestCase): + + MOCK_CONFIG = { + 'dnshost': '8.8.8.8', + 'dnsport': 53, + 'geoippath': DEFAULT_GEO_IP_DB_PATH, + 'asndatabasefilename': DEFAULT_ASN_DB_FILENAME, + 'citydatabasefilename': DEFAULT_CC_DB_FILENAME, + 'excluded_ips': [], + } + + def test__ip_to_asn__called_or_not(self): + data = super(TestEnricherWithFullConfig, self).test__ip_to_asn__called_or_not() + self._assert_geoip_method_called(self.enricher.ip_to_asn, data) + + def test__ip_to_cc__called_or_not(self): + data = super(TestEnricherWithFullConfig, self).test__ip_to_cc__called_or_not() + self._assert_geoip_method_called(self.enricher.ip_to_cc, data) + + def test__enrich__with_no_data(self): + data = self.enricher.enrich(RecordDict({})) + self.assertEqualIncludingTypes(data, RecordDict({'enriched': ([], {})})) + + def test__enrich__with_irrelevant_data(self): + data = 
self.enricher.enrich(RecordDict(self.COMMON_DATA)) + self.assertEqualIncludingTypes(data, RecordDict(dict(self.COMMON_DATA, **{ + 'enriched': ([], {})}))) + + def test__enrich__with_fqdn_given(self): + data = super(TestEnricherWithFullConfig, self).test__enrich__with_fqdn_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"127.0.0.1": ["asn", "cc", "ip"]}), + "fqdn": "cert.pl", + "address": [{"ip": '127.0.0.1', + "asn": '1234', + "cc": 'PL'}]})) + + def test__enrich__with_fqdn_given__with_nodns_flag(self): + data = self.enricher.enrich(RecordDict({ + "fqdn": "cert.pl", + "_do_not_resolve_fqdn_to_ip": True})) + self.assertFalse(self.enricher._resolver.query.called) + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {}), + "fqdn": "cert.pl", + "_do_not_resolve_fqdn_to_ip": True})) + + def test__enrich__with_fqdn_given__resolved_to_various_ips_with_duplicates(self): + data = super(TestEnricherWithFullConfig, + self).test__enrich__with_fqdn_given__resolved_to_various_ips_with_duplicates() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"1.0.1.1": ["asn", "cc", "ip"], + "1.1.1.1": ["asn", "cc", "ip"], + "12.11.10.9": ["asn", "cc", "ip"], + "127.0.0.1": ["asn", "cc", "ip"], + "13.1.2.3": ["asn", "cc", "ip"], + "2.2.2.2": ["asn", "cc", "ip"]}), + "fqdn": "cert.pl", + "address": [{"ip": '1.0.1.1', # note: *removed IP duplicates* and + "asn": '1234', # *ordered* by IP (textually) + "cc": 'PL'}, + {"ip": '1.1.1.1', + "asn": '1234', + "cc": 'PL'}, + {"ip": '12.11.10.9', + "asn": '1234', + "cc": 'PL'}, + {"ip": '127.0.0.1', + "asn": '1234', + "cc": 'PL'}, + {"ip": '13.1.2.3', + "asn": '1234', + "cc": 'PL'}, + {"ip": '2.2.2.2', + "asn": '1234', + "cc": 'PL'}]})) + + def test__enrich__with_url_given(self): + data = super(TestEnricherWithFullConfig, self).test__enrich__with_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {"127.0.0.1": ["asn", "cc", "ip"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl", + "address": [{"ip": '127.0.0.1', + "asn": '1234', + "cc": 'PL'}]})) + + def test__enrich__with_url_given__with_nodns_flag(self): + data = self.enricher.enrich(RecordDict({ + "url": "http://www.nask.pl/asd", + "_do_not_resolve_fqdn_to_ip": True})) + self.assertFalse(self.enricher._resolver.query.called) + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl", + "_do_not_resolve_fqdn_to_ip": True})) + + def test__enrich__with_wrong_url_given(self): + data = self.enricher.enrich(RecordDict({"url": "http://http://www.nask.pl/asd"})) + self.assertEqual(self.enricher._resolver.mock_calls, []) + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {}), + "url": "http://http://www.nask.pl/asd"})) + + def test__enrich__with_fqdn_not_resolved(self): + self.enricher._resolver.query = unittest.mock.MagicMock(side_effect=DNSException) + data = self.enricher.enrich(RecordDict({"fqdn": "cert.pl"})) + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {}), + "fqdn": "cert.pl"})) + + def test__enrich__with_fqdn_from_url_not_resolved(self): + self.enricher._resolver.query = unittest.mock.MagicMock(side_effect=DNSException) + data = self.enricher.enrich(RecordDict({"url": "http://www.nask.pl/asd"})) + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl"})) + + def 
test__enrich__with_ip_url_given(self): + data = super(TestEnricherWithFullConfig, self).test__enrich__with_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"192.168.0.1": ["asn", "cc", "ip"]}), + "url": "http://192.168.0.1/asd", + "address": [{"ip": '192.168.0.1', + "asn": '1234', + "cc": 'PL'}]})) + + def test__enrich__with_ip_url_given__with_nodns_flag(self): + data = super(TestEnricherWithFullConfig, + self).test__enrich__with_ip_url_given__with_nodns_flag() + self.assertEqualIncludingTypes(data, RecordDict({ + # (here the '_do_not_resolve_fqdn_to_ip' flag did *not* change behaviour) + "enriched": ([], {"192.168.0.1": ["asn", "cc", "ip"]}), + "url": "http://192.168.0.1/asd", + "address": [{"ip": '192.168.0.1', + "asn": '1234', + "cc": 'PL'}], + "_do_not_resolve_fqdn_to_ip": True})) + + def test__enrich__with_fqdn_and_url_given(self): + data = super(TestEnricherWithFullConfig, self).test__enrich__with_fqdn_and_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"127.0.0.1": ["asn", "cc", "ip"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "cert.pl", + "address": [{"ip": '127.0.0.1', + "asn": '1234', + "cc": 'PL'}]})) + + def test__enrich__with_fqdn_and_url_given__with_nodns_flag(self): + data = self.enricher.enrich(RecordDict({ + "fqdn": "cert.pl", + "url": "http://www.nask.pl/asd", + "_do_not_resolve_fqdn_to_ip": True})) + self.assertFalse(self.enricher._resolver.query.called) + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {}), + "url": "http://www.nask.pl/asd", + "fqdn": "cert.pl", + "_do_not_resolve_fqdn_to_ip": True})) + + def test__enrich__with_fqdn_and_ip_url_given(self): + data = super(TestEnricherWithFullConfig, self).test__enrich__with_fqdn_and_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"127.0.0.1": ["asn", "cc", "ip"]}), + "url": "http://192.168.0.1/asd", + "fqdn": "cert.pl", + "address": [{"ip": '127.0.0.1', + "asn": '1234', + "cc": 'PL'}]})) + + def test__enrich__with_address_and_fqdn_given(self): + data = super(TestEnricherWithFullConfig, self).test__enrich__with_address_and_fqdn_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"10.20.30.40": ["asn", "cc"]}), + "fqdn": "cert.pl", + "address": [{"ip": '10.20.30.40', + "asn": '1234', + "cc": 'PL'}]})) + + def test__enrich__with_address_and_fqdn_given__with_nodns_flag(self): + data = super(TestEnricherWithFullConfig, + self).test__enrich__with_address_and_fqdn_given__with_nodns_flag() + self.assertEqualIncludingTypes(data, RecordDict({ + # (here the '_do_not_resolve_fqdn_to_ip' flag did *not* change behaviour) + "enriched": ([], {"10.20.30.40": ["asn", "cc"]}), + "fqdn": "cert.pl", + "address": [{"ip": '10.20.30.40', + "asn": '1234', + "cc": 'PL'}], + "_do_not_resolve_fqdn_to_ip": True})) + + def test__enrich__with_address_and_url_given(self): + data = super(TestEnricherWithFullConfig, + self).test__enrich__with_address_and_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {"10.20.30.40": ["asn", "cc"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl", + "address": [{"ip": '10.20.30.40', + "asn": '1234', + "cc": 'PL'}]})) + + def test__enrich__with_address_and_ip_url_given(self): + data = super(TestEnricherWithFullConfig, + self).test__enrich__with_address_and_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"10.20.30.40": ["asn", "cc"]}), + "url": 
"http://192.168.0.3/asd", + "address": [{"ip": '10.20.30.40', + "asn": '1234', + "cc": 'PL'}]})) + + def test__enrich__with_address_and_fqdn_and_url_given(self): + data = super(TestEnricherWithFullConfig, + self).test__enrich__with_address_and_fqdn_and_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"10.20.30.40": ["asn", "cc"]}), + "fqdn": "cert.pl", + "url": "http://www.nask.pl/asd", + "address": [{"ip": '10.20.30.40', + "asn": '1234', + "cc": 'PL'}]})) + + def test__enrich__with_address_and_fqdn_and_ip_url_given(self): + data = super(TestEnricherWithFullConfig, + self).test__enrich__with_address_and_fqdn_and_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"10.20.30.40": ["asn", "cc"]}), + "fqdn": "cert.pl", + "url": "http://192.168.0.1/asd", + "address": [{"ip": '10.20.30.40', + "asn": '1234', + "cc": 'PL'}]})) + + def test__fqdn_to_ip__called(self): + """Test if fqdn_to_ip is called if data does not contain address""" + data = RecordDict({"fqdn": "cert.pl"}) + data.update(self.COMMON_DATA) + self.enricher.fqdn_to_ip = unittest.mock.MagicMock() + self.enricher.enrich(data) + self.enricher.fqdn_to_ip.assert_called_with("cert.pl") + + def test__url_to_fqdn_or_ip__called(self): + """Test if url_to_fqdn_or_ip is called if data does not contain address and fqdn""" + data = RecordDict({"url": "http://www.cert.pl"}) + data.update(self.COMMON_DATA) + self.enricher.url_to_fqdn_or_ip = unittest.mock.MagicMock(return_value="www.cert.pl") + self.enricher.enrich(data) + self.enricher.url_to_fqdn_or_ip.assert_called_with("http://www.cert.pl") + + def test__url_to_fqdn_or_ip__called_for_ip_url(self): + """Test if url_to_fqdn_or_ip is called if data does not contain address and fqdn""" + data = RecordDict({"url": "http://192.168.0.1"}) + data.update(self.COMMON_DATA) + self.enricher.url_to_fqdn_or_ip = unittest.mock.MagicMock(return_value="192.168.0.1") + self.enricher.enrich(data) + self.enricher.url_to_fqdn_or_ip.assert_called_with("http://192.168.0.1") + + def test_adding_asn_cc_if_asn_not_valid_and_cc_is_valid(self): + """Test if asn/cc are (maybe) added""" + data_init = self._get_actual_data_for_adding_asn_cc_if_possible() + self._set_asn_db_side_effect_if_enabled(GeoIP2Error) + self._set_cc_db_return_value_if_enabled('PL') + data_expected = self._enricher_execution_helper(data_init) + self.assertEqual([{u'cc': u'PL', u'ip': u'127.0.0.1'}, + {u'cc': u'PL', u'ip': u'192.187.0.1'}, + {u'cc': u'PL', u'ip': u'10.15.1.255'}], data_expected["address"]) + self.assertEqual(([], {u'10.15.1.255': [u'cc'], + u'127.0.0.1': [u'cc'], + u'192.187.0.1': [u'cc']}), + data_expected["enriched"]) + + def test_adding_asn_cc_if_asn_and_cc_are_valid(self): + """Test if asn/cc are (maybe) added""" + data_init = self._get_actual_data_for_adding_asn_cc_if_possible() + self._set_asn_db_return_value_if_enabled(1234) + self._set_cc_db_return_value_if_enabled('UK') + data_expected = self._enricher_execution_helper(data_init) + self.assertEqual([{u'asn': 1234, u'cc': u'UK', u'ip': u'127.0.0.1'}, + {u'asn': 1234, u'cc': u'UK', u'ip': u'192.187.0.1'}, + {u'asn': 1234, u'cc': u'UK', u'ip': u'10.15.1.255'}], + data_expected["address"]) + self.assertEqual(([], {u'10.15.1.255': [u'asn', u'cc'], + u'127.0.0.1': [u'asn', u'cc'], + u'192.187.0.1': [u'asn', u'cc']}), + data_expected["enriched"]) + + def test_adding_asn_cc_if_asn_is_valid_and_cc_is_not(self): + """Test if asn/cc are (maybe) added""" + data_init = 
self._get_actual_data_for_adding_asn_cc_if_possible() + self._set_asn_db_return_value_if_enabled(123456) + self._set_cc_db_side_effect_if_enabled(GeoIP2Error) + data_expected = self._enricher_execution_helper(data_init) + self.assertEqual([{u'asn': 123456, u'ip': u'127.0.0.1'}, + {u'asn': 123456, u'ip': u'192.187.0.1'}, + {u'asn': 123456, u'ip': u'10.15.1.255'}], + data_expected["address"]) + self.assertEqual(([], {u'10.15.1.255': [u'asn'], + u'127.0.0.1': [u'asn'], + u'192.187.0.1': [u'asn']}), + data_expected["enriched"]) + + @unittest.mock.patch('n6datapipeline.enrich.LOGGER') + def test_existing_asn_cc_always_dropped_and_new_ones_added_if_asn_and_are_not_valid(self, LOGGER_mock): + """Test if already existing asn/cc are removed and new ones are (maybe) added""" + data_init = self._get_actual_data_for_existing_asn_cc_always_dropped_and_new_ones_added_if_possible() + self._set_asn_db_side_effect_if_enabled(GeoIP2Error) + self._set_cc_db_side_effect_if_enabled(GeoIP2Error) + data_expected = self._enricher_execution_helper(data_init, expected_num_of_warnings=4) + self.assertEqual([{u'ip': u'127.0.0.1'}, + {u'ip': u'192.187.0.1'}, + {u'ip': u'10.15.1.255'}], data_expected["address"]) + self.assertEqual(([], {}), data_expected["enriched"]) + self.assertEqual(len(LOGGER_mock.warning.mock_calls), self.expected_num_of_warnings) + + @unittest.mock.patch('n6datapipeline.enrich.LOGGER') + def test_existing_asn_cc_always_dropped_and_new_ones_added_if_asn_is_not_valid(self, LOGGER_mock): + """Test if already existing asn/cc are removed and new ones are (maybe) added""" + self._set_asn_db_side_effect_if_enabled(GeoIP2Error) + self._set_cc_db_return_value_if_enabled('PL') + data_init = self._get_actual_data_for_existing_asn_cc_always_dropped_and_new_ones_added_if_possible() + data_expected = self._enricher_execution_helper(data_init, expected_num_of_warnings=4) + self.assertEqual([{u'cc': u'PL', u'ip': u'127.0.0.1'}, + {u'cc': u'PL', u'ip': u'192.187.0.1'}, + {u'cc': u'PL', u'ip': u'10.15.1.255'}], + data_expected["address"]) + self.assertEqual(([], {u'10.15.1.255': [u'cc'], + u'127.0.0.1': [u'cc'], + u'192.187.0.1': [u'cc']}), + data_expected["enriched"]) + self.assertEqual( + len(LOGGER_mock.warning.mock_calls), + self.expected_num_of_warnings) + + @unittest.mock.patch('n6datapipeline.enrich.LOGGER') + def test_existing_asn_cc_always_dropped_and_new_ones_added_if_asn_and_cc_are_valid(self, LOGGER_mock): + """Test if already existing asn/cc are removed and new ones are (maybe) added""" + self._set_asn_db_return_value_if_enabled(12345) + self._set_cc_db_return_value_if_enabled('UK') + data_init = self._get_actual_data_for_existing_asn_cc_always_dropped_and_new_ones_added_if_possible() + data_expected = self._enricher_execution_helper(data_init, expected_num_of_warnings=4) + self.assertEqual([{u'asn': 12345, u'cc': u'UK', u'ip': u'127.0.0.1'}, + {u'asn': 12345, u'cc': u'UK', u'ip': u'192.187.0.1'}, + {u'asn': 12345, u'cc': u'UK', u'ip': u'10.15.1.255'}], + data_expected["address"]) + self.assertEqual(([], + {u'10.15.1.255': [u'asn', u'cc'], + u'127.0.0.1': [u'asn', u'cc'], + u'192.187.0.1': [u'asn', u'cc']}), data_expected["enriched"]) + self.assertEqual( + len(LOGGER_mock.warning.mock_calls), + self.expected_num_of_warnings) + + def test__fqdn_to_ip__not_called(self): + """Test if fqdn_to_ip not called if address already present""" + data = RecordDict({ + "address": [{"ip": "127.0.0.1"}, + {"ip": "192.187.0.1"}, + {"ip": "10.15.1.255"}]}) + data.update(self.COMMON_DATA) + self.enricher.fqdn_to_ip = 
unittest.mock.MagicMock(return_value="127.0.0.1") + self.enricher.enrich(data) + self.assertFalse(self.enricher.fqdn_to_ip.called) + + def test_routing_key_modified(self): + """Test if routing key after enrichement is set to "enriched.*" + when publishing to output queue""" + self.enricher.publish_output = unittest.mock.MagicMock() + data = RecordDict({ + "address": [{"ip": "127.0.0.1"}, + {"ip": "192.187.0.1"}, + {"ip": "10.15.1.255"}]}) + data.update(self.COMMON_DATA) + body = data.get_ready_json() + initial_routing_key = "event.parsed.test.test-source" + properties = None + self.enricher.input_callback(initial_routing_key, body, properties) + args, kwargs = self.enricher.publish_output.call_args + self.assertIn("routing_key", kwargs) + self.assertEqual(kwargs["routing_key"], "event.enriched.test.test-source") + + def test__get_excluded_ips__with_excluded_ips_in_config(self): + self._prepare_config_for_excluded_ips(['1.1.1.1', '2.2.2.2', '3.3.3.3']) + expected = iptools.IpRangeList('1.1.1.1', '2.2.2.2', '3.3.3.3') + result = self.enricher._get_excluded_ips() + self.assertCountEqual(expected, result) + + def test__get_excluded_ips__without_excluded_ips_in_config(self): + self._prepare_config_for_excluded_ips([]) + expected = None + result = self.enricher._get_excluded_ips() + self.assertEqual(expected, result) + + def test__enrich__with_excluded_ips_config__with_some_ip_to_exclude__1(self): + self._prepare_config_for_excluded_ips(['127.0.0.1', '2.2.2.2', '3.3.3.3']) + self.enricher.excluded_ips = self.enricher._get_excluded_ips() + data = self.enricher.enrich(RecordDict({"url": "http://www.nask.pl/asd", + "address": [{'ip': "127.0.0.1"}]})) + # the 'data' field is present, so FQDN will not be resolved + # to IP addresses + self.assertFalse(self.enricher._resolver.query.called) + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl"})) # (note: emptied `address` removed) + + def test__enrich__with_excluded_ips_config__with_some_ip_to_exclude__2(self): + self._prepare_config_for_excluded_ips(['127.0.0.1', '2.2.2.2', '3.3.3.3']) + self.enricher.excluded_ips = self.enricher._get_excluded_ips() + data = self.enricher.enrich(RecordDict({"url": "http://www.nask.pl/asd"})) + self.enricher._resolver.query.assert_called_once_with("www.nask.pl", "A") + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl"})) # (note: emptied `address` removed) + + def test__enrich__with_excluded_ips_config__without_any_ip_to_exclude(self): + data = super(TestEnricherWithFullConfig, + self).test__enrich__with_excluded_ips_config__without_any_ip_to_exclude() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {"127.0.0.1": ["asn", "cc", "ip"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl", + "address": [{"ip": '127.0.0.1', + "asn": '1234', + "cc": 'PL'}]})) + + def test__filter_out_excluded_ips__with_excluded_ips_being_None(self): + self.enricher.excluded_ips = None + data = RecordDict({ + "url": "http://www.nask.pl/asd", + "address": [{'ip': "127.0.0.1"}], + }) + expected = RecordDict({ + "url": "http://www.nask.pl/asd", + "address": [{'ip': "127.0.0.1"}], + }) + ip_to_enr_mock = unittest.mock.MagicMock() + ip_to_enr_expected_calls = [] + self.enricher._filter_out_excluded_ips(data, ip_to_enr_mock) + self.assertEqualIncludingTypes(expected, data) + self.assertEqual(ip_to_enr_mock.mock_calls, 
ip_to_enr_expected_calls) + + def test__filter_out_excluded_ips__with_no_ip_in_excluded_ips(self): + self.enricher.excluded_ips = iptools.IpRangeList('1.1.1.1', '2.2.2.2', '3.3.3.3') + data = RecordDict({ + "url": "http://www.nask.pl/asd", + "address": [{'ip': '1.1.1.5'}, {'ip': '2.1.1.1'}], + }) + expected = RecordDict({ + "url": "http://www.nask.pl/asd", + "address": [{'ip': '1.1.1.5'}, {'ip': '2.1.1.1'}], + }) + ip_to_enr_mock = unittest.mock.MagicMock() + ip_to_enr_expected_calls = [] + self.enricher._filter_out_excluded_ips(data, ip_to_enr_mock) + self.assertEqualIncludingTypes(expected, data) + self.assertEqual(ip_to_enr_mock.mock_calls, ip_to_enr_expected_calls) + + def test__filter_out_excluded_ips__with_ip_in_excluded_ips__1(self): + self.enricher.excluded_ips = iptools.IpRangeList('1.1.1.1', '2.2.2.2', '3.3.3.3') + data = RecordDict({ + "url": "http://www.nask.pl/asd", + "address": [{'ip': '1.1.1.1'}, {'ip': '1.1.1.6'}], + }) + expected = RecordDict({ + "url": "http://www.nask.pl/asd", + "address": [{'ip': '1.1.1.6'}], + }) + ip_to_enr_mock = unittest.mock.MagicMock() + ip_to_enr_expected_calls = [ + unittest.mock.call.pop('1.1.1.1', None), + ] + self.enricher._filter_out_excluded_ips(data, ip_to_enr_mock) + self.assertEqualIncludingTypes(expected, data) + self.assertEqual(ip_to_enr_mock.mock_calls, ip_to_enr_expected_calls) + + def test__filter_out_excluded_ips__with_ip_in_excluded_ips__2(self): + self.enricher.excluded_ips = iptools.IpRangeList('1.1.1.1', '2.2.2.2', '3.3.3.3') + data = RecordDict({ + "url": "http://www.nask.pl/asd", + "address": [{'ip': '1.1.1.1', 'asn': 1234}], + }) + expected = RecordDict({ + "url": "http://www.nask.pl/asd", + "address": [], + }) + ip_to_enr_mock = unittest.mock.MagicMock() + ip_to_enr_expected_calls = [ + unittest.mock.call.pop('1.1.1.1', None), + ] + self.enricher._filter_out_excluded_ips(data, ip_to_enr_mock) + self.assertEqualIncludingTypes(expected, data) + self.assertEqual(ip_to_enr_mock.mock_calls, ip_to_enr_expected_calls) + + def test__filter_out_excluded_ips__with_range_of_ips(self): + self.enricher.excluded_ips = iptools.IpRangeList('3.0.0.0/8') + data = RecordDict({ + "url": "http://www.nask.pl/asd", + "address": [ + { + 'ip': '3.3.3.3', + 'asn': 1234 + }, + { + 'ip': '3.255.255.255', + 'asn': 5632 + }, + { + 'ip': '3.0.0.0', + 'asn': 5631 + }, + { + 'ip': '2.255.255.255', + 'asn': 5632 + }, + ], + }) + expected = RecordDict({ + "url": "http://www.nask.pl/asd", + "address": [ + { + 'ip': '2.255.255.255', + 'asn': 5632, + }, + ], + }) + ip_to_enr_mock = unittest.mock.MagicMock() + ip_to_enr_expected_call_items = [ + unittest.mock.call.pop('3.3.3.3', None), + unittest.mock.call.pop('3.255.255.255', None), + unittest.mock.call.pop('3.0.0.0', None), + ] + self.enricher._filter_out_excluded_ips(data, ip_to_enr_mock) + self.assertEqualIncludingTypes(expected, data) + self.assertCountEqual(ip_to_enr_mock.mock_calls, ip_to_enr_expected_call_items) + + +class TestEnricherNoASNDatabase(_BaseTestEnricher, unittest.TestCase): + + MOCK_CONFIG = { + 'dnshost': '8.8.8.8', + 'dnsport': 53, + 'geoippath': DEFAULT_GEO_IP_DB_PATH, + 'asndatabasefilename': '', + 'citydatabasefilename': DEFAULT_CC_DB_FILENAME, + 'excluded_ips': [], + } + + def test__ip_to_asn__called_or_not(self): + super(TestEnricherNoASNDatabase, self).test__ip_to_asn__called_or_not() + self._assert_geoip_method_not_called(self.enricher.ip_to_asn) + + def test__ip_to_cc__called_or_not(self): + data = super(TestEnricherNoASNDatabase, self).test__ip_to_cc__called_or_not() + 
self._assert_geoip_method_called(self.enricher.ip_to_cc, data) + + def test__enrich__with_fqdn_given(self): + data = self.enricher.enrich(RecordDict({"fqdn": "cert.pl"})) + self.enricher._resolver.query.assert_called_once_with("cert.pl", "A") + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"127.0.0.1": ["cc", "ip"]}), + "fqdn": "cert.pl", + "address": [{"ip": '127.0.0.1', + "cc": 'PL'}]})) + + def test__enrich__with_fqdn_given__resolved_to_various_ips_with_duplicates(self): + data = super(TestEnricherNoASNDatabase, + self).test__enrich__with_fqdn_given__resolved_to_various_ips_with_duplicates() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"1.0.1.1": ["cc", "ip"], + "1.1.1.1": ["cc", "ip"], + "12.11.10.9": ["cc", "ip"], + "127.0.0.1": ["cc", "ip"], + "13.1.2.3": ["cc", "ip"], + "2.2.2.2": ["cc", "ip"]}), + "fqdn": "cert.pl", + "address": [{"ip": '1.0.1.1', # note: *removed IP duplicates* and + "cc": 'PL'}, # *ordered* by IP (textually) + {"ip": '1.1.1.1', + "cc": 'PL'}, + {"ip": '12.11.10.9', + "cc": 'PL'}, + {"ip": '127.0.0.1', + "cc": 'PL'}, + {"ip": '13.1.2.3', + "cc": 'PL'}, + {"ip": '2.2.2.2', + "cc": 'PL'}]})) + + def test__enrich__with_url_given(self): + data = super(TestEnricherNoASNDatabase, self).test__enrich__with_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {"127.0.0.1": ["cc", "ip"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl", + "address": [{"ip": '127.0.0.1', + "cc": 'PL'}]})) + + def test__enrich__with_ip_url_given(self): + data = super(TestEnricherNoASNDatabase, self).test__enrich__with_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"192.168.0.1": ["cc", "ip"]}), + "url": "http://192.168.0.1/asd", + "address": [{"ip": '192.168.0.1', + "cc": 'PL'}]})) + + def test__enrich__with_ip_url_given__with_nodns_flag(self): + data = super(TestEnricherNoASNDatabase, + self).test__enrich__with_ip_url_given__with_nodns_flag() + self.assertEqualIncludingTypes(data, RecordDict({ + # (here the '_do_not_resolve_fqdn_to_ip' flag did *not* change behaviour) + "enriched": ([], {"192.168.0.1": ["cc", "ip"]}), + "url": "http://192.168.0.1/asd", + "address": [{"ip": '192.168.0.1', + "cc": 'PL'}], + "_do_not_resolve_fqdn_to_ip": True})) + + def test__enrich__with_fqdn_and_url_given(self): + data = super(TestEnricherNoASNDatabase, self).test__enrich__with_fqdn_and_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"127.0.0.1": ["cc", "ip"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "cert.pl", + "address": [{"ip": '127.0.0.1', + "cc": 'PL'}]})) + + def test__enrich__with_fqdn_and_ip_url_given(self): + data = super(TestEnricherNoASNDatabase, self).test__enrich__with_fqdn_and_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"127.0.0.1": ["cc", "ip"]}), + "url": "http://192.168.0.1/asd", + "fqdn": "cert.pl", + "address": [{"ip": '127.0.0.1', + "cc": 'PL'}]})) + + def test__enrich__with_address_and_fqdn_given(self): + data = super(TestEnricherNoASNDatabase, self).test__enrich__with_address_and_fqdn_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"10.20.30.40": ["cc"]}), + "fqdn": "cert.pl", + "address": [{"ip": '10.20.30.40', + "cc": 'PL'}]})) + + def test__enrich__with_address_and_fqdn_given__with_nodns_flag(self): + data = super(TestEnricherNoASNDatabase, + self).test__enrich__with_address_and_fqdn_given__with_nodns_flag() + 
self.assertEqualIncludingTypes(data, RecordDict({ + # (here the '_do_not_resolve_fqdn_to_ip' flag did *not* change behaviour) + "enriched": ([], {"10.20.30.40": ["cc"]}), + "fqdn": "cert.pl", + "address": [{"ip": '10.20.30.40', + "cc": 'PL'}], + "_do_not_resolve_fqdn_to_ip": True})) + + def test__enrich__with_address_and_url_given(self): + data = super(TestEnricherNoASNDatabase, self).test__enrich__with_address_and_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {"10.20.30.40": ["cc"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl", + "address": [{"ip": '10.20.30.40', + "cc": 'PL'}]})) + + def test__enrich__with_address_and_ip_url_given(self): + data = super(TestEnricherNoASNDatabase, + self).test__enrich__with_address_and_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"10.20.30.40": ["cc"]}), + "url": "http://192.168.0.3/asd", + "address": [{"ip": '10.20.30.40', + "cc": 'PL'}]})) + + def test__enrich__with_address_and_fqdn_and_url_given(self): + data = super(TestEnricherNoASNDatabase, + self).test__enrich__with_address_and_fqdn_and_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"10.20.30.40": ["cc"]}), + "fqdn": "cert.pl", + "url": "http://www.nask.pl/asd", + "address": [{"ip": '10.20.30.40', + "cc": 'PL'}]})) + + def test__enrich__with_address_and_fqdn_and_ip_url_given(self): + data = super(TestEnricherNoASNDatabase, + self).test__enrich__with_address_and_fqdn_and_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"10.20.30.40": ["cc"]}), + "fqdn": "cert.pl", + "url": "http://192.168.0.1/asd", + "address": [{"ip": '10.20.30.40', + "cc": 'PL'}]})) + + def test_adding_geoip_data_if_cc_is_valid(self): + data_init = self._get_actual_data_for_adding_asn_cc_if_possible() + self._set_cc_db_return_value_if_enabled('US') + data_expected = self._enricher_execution_helper(data_init) + self.assertEqual([{u'cc': u'US', u'ip': u'127.0.0.1'}, + {u'cc': u'US', u'ip': u'192.187.0.1'}, + {u'cc': u'US', u'ip': u'10.15.1.255'}], data_expected["address"]) + self.assertEqual(([], {u'10.15.1.255': [u'cc'], + u'127.0.0.1': [u'cc'], + u'192.187.0.1': [u'cc']}), + data_expected["enriched"]) + + def test_adding_geoip_data_if_cc_is_not_valid(self): + data_init = self._get_actual_data_for_adding_asn_cc_if_possible() + self._set_cc_db_side_effect_if_enabled(GeoIP2Error) + data_expected = self._enricher_execution_helper(data_init) + self.assertEqual([{u'ip': u'127.0.0.1'}, + {u'ip': u'192.187.0.1'}, + {u'ip': u'10.15.1.255'}], data_expected["address"]) + self.assertEqual(([], {}), data_expected["enriched"]) + + @unittest.mock.patch('n6datapipeline.enrich.LOGGER') + def test_existing_geoip_data__drop_and_add_cc__if_cc_is_valid(self, LOGGER_mock): + data_init = self._get_actual_data_for_existing_asn_cc_always_dropped_and_new_ones_added_if_possible() + self._set_cc_db_return_value_if_enabled('FR') + data_expected = self._enricher_execution_helper(data_init, expected_num_of_warnings=2) + self.assertEqual([{u'ip': u'127.0.0.1', u'cc': u'FR'}, + {u'ip': u'192.187.0.1', u'cc': u'FR', u'asn': 424242}, + {u'ip': u'10.15.1.255', u'cc': u'FR', u'asn': 434343}], + data_expected["address"]) + self.assertEqual(([], {u'127.0.0.1': [u'cc'], + u'192.187.0.1': [u'cc'], + u'10.15.1.255': [u'cc']}), + data_expected["enriched"]) + self.assertEqual(len(LOGGER_mock.warning.mock_calls), self.expected_num_of_warnings) + + @unittest.mock.patch('n6datapipeline.enrich.LOGGER') + 
def test_existing_geoip_data__drop_cc__if_cc_is_invalid(self, LOGGER_mock): + data_init = self._get_actual_data_for_existing_asn_cc_always_dropped_and_new_ones_added_if_possible() + self._set_cc_db_side_effect_if_enabled(GeoIP2Error) + data_expected = self._enricher_execution_helper(data_init, expected_num_of_warnings=2) + self.assertEqual([{u'ip': u'127.0.0.1'}, + {u'ip': u'192.187.0.1', u'asn': 424242}, + {u'ip': u'10.15.1.255', u'asn': 434343}], + data_expected["address"]) + self.assertEqual(([], {}), data_expected["enriched"]) + self.assertEqual(len(LOGGER_mock.warning.mock_calls), self.expected_num_of_warnings) + + def test__enrich__with_excluded_ips_config__without_any_ip_to_exclude(self): + data = super(TestEnricherNoASNDatabase, + self).test__enrich__with_excluded_ips_config__without_any_ip_to_exclude() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {"127.0.0.1": ["cc", "ip"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl", + "address": [{"ip": '127.0.0.1', + "cc": 'PL'}]})) + + +class TestEnricherNoCCDatabase(_BaseTestEnricher, unittest.TestCase): + + MOCK_CONFIG = { + 'dnshost': '8.8.8.8', + 'dnsport': 53, + 'geoippath': DEFAULT_GEO_IP_DB_PATH, + 'asndatabasefilename': DEFAULT_ASN_DB_FILENAME, + 'citydatabasefilename': '', + 'excluded_ips': [], + } + + def test__ip_to_asn__called_or_not(self): + data = super(TestEnricherNoCCDatabase, self).test__ip_to_asn__called_or_not() + self._assert_geoip_method_called(self.enricher.ip_to_asn, data) + + def test__ip_to_cc__called_or_not(self): + super(TestEnricherNoCCDatabase, self).test__ip_to_cc__called_or_not() + self._assert_geoip_method_not_called(self.enricher.ip_to_cc) + + def test__enrich__with_fqdn_given(self): + data = self.enricher.enrich(RecordDict({"fqdn": "cert.pl"})) + self.enricher._resolver.query.assert_called_once_with("cert.pl", "A") + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"127.0.0.1": ["asn", "ip"]}), + "fqdn": "cert.pl", + "address": [{"ip": '127.0.0.1', + "asn": '1234'}]})) + + def test__enrich__with_fqdn_given__resolved_to_various_ips_with_duplicates(self): + data = super(TestEnricherNoCCDatabase, + self).test__enrich__with_fqdn_given__resolved_to_various_ips_with_duplicates() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"1.0.1.1": ["asn", "ip"], + "1.1.1.1": ["asn", "ip"], + "12.11.10.9": ["asn", "ip"], + "127.0.0.1": ["asn", "ip"], + "13.1.2.3": ["asn", "ip"], + "2.2.2.2": ["asn", "ip"]}), + "fqdn": "cert.pl", + "address": [{"ip": '1.0.1.1', # note: *removed IP duplicates* and + "asn": '1234'}, # *ordered* by IP (textually) + {"ip": '1.1.1.1', + "asn": '1234'}, + {"ip": '12.11.10.9', + "asn": '1234'}, + {"ip": '127.0.0.1', + "asn": '1234'}, + {"ip": '13.1.2.3', + "asn": '1234'}, + {"ip": '2.2.2.2', + "asn": '1234'}]})) + + def test__enrich__with_url_given(self): + data = super(TestEnricherNoCCDatabase, self).test__enrich__with_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {"127.0.0.1": ["asn", "ip"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl", + "address": [{"ip": '127.0.0.1', + "asn": '1234'}]})) + + def test__enrich__with_ip_url_given(self): + data = super(TestEnricherNoCCDatabase, self).test__enrich__with_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"192.168.0.1": ["asn", "ip"]}), + "url": "http://192.168.0.1/asd", + "address": [{"ip": '192.168.0.1', + "asn": '1234'}]})) + + def 
test__enrich__with_ip_url_given__with_nodns_flag(self): + data = super(TestEnricherNoCCDatabase, + self).test__enrich__with_ip_url_given__with_nodns_flag() + self.assertEqualIncludingTypes(data, RecordDict({ + # (here the '_do_not_resolve_fqdn_to_ip' flag did *not* change behaviour) + "enriched": ([], {"192.168.0.1": ["asn", "ip"]}), + "url": "http://192.168.0.1/asd", + "address": [{"ip": '192.168.0.1', + "asn": '1234'}], + "_do_not_resolve_fqdn_to_ip": True})) + + def test__enrich__with_fqdn_and_url_given(self): + data = super(TestEnricherNoCCDatabase, self).test__enrich__with_fqdn_and_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"127.0.0.1": ["asn", "ip"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "cert.pl", + "address": [{"ip": '127.0.0.1', + "asn": '1234'}]})) + + def test__enrich__with_fqdn_and_ip_url_given(self): + data = super(TestEnricherNoCCDatabase, self).test__enrich__with_fqdn_and_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"127.0.0.1": ["asn", "ip"]}), + "url": "http://192.168.0.1/asd", + "fqdn": "cert.pl", + "address": [{"ip": '127.0.0.1', + "asn": '1234'}]})) + + def test__enrich__with_address_and_fqdn_given(self): + data = super(TestEnricherNoCCDatabase, self).test__enrich__with_address_and_fqdn_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"10.20.30.40": ["asn"]}), + "fqdn": "cert.pl", + "address": [{"ip": '10.20.30.40', + "asn": '1234'}]})) + + def test__enrich__with_address_and_fqdn_given__with_nodns_flag(self): + data = super(TestEnricherNoCCDatabase, + self).test__enrich__with_address_and_fqdn_given__with_nodns_flag() + self.assertEqualIncludingTypes(data, RecordDict({ + # (here the '_do_not_resolve_fqdn_to_ip' flag did *not* change behaviour) + "enriched": ([], {"10.20.30.40": ["asn"]}), + "fqdn": "cert.pl", + "address": [{"ip": '10.20.30.40', + "asn": '1234'}], + "_do_not_resolve_fqdn_to_ip": True})) + + def test__enrich__with_address_and_url_given(self): + data = super(TestEnricherNoCCDatabase, self).test__enrich__with_address_and_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {"10.20.30.40": ["asn"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl", + "address": [{"ip": '10.20.30.40', + "asn": '1234'}]})) + + def test__enrich__with_address_and_ip_url_given(self): + data = super(TestEnricherNoCCDatabase, self).test__enrich__with_address_and_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"10.20.30.40": ["asn"]}), + "url": "http://192.168.0.3/asd", + "address": [{"ip": '10.20.30.40', + "asn": '1234'}]})) + + def test__enrich__with_address_and_fqdn_and_url_given(self): + data = super(TestEnricherNoCCDatabase, + self).test__enrich__with_address_and_fqdn_and_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"10.20.30.40": ["asn"]}), + "fqdn": "cert.pl", + "url": "http://www.nask.pl/asd", + "address": [{"ip": '10.20.30.40', + "asn": '1234'}]})) + + def test__enrich__with_address_and_fqdn_and_ip_url_given(self): + data = super(TestEnricherNoCCDatabase, + self).test__enrich__with_address_and_fqdn_and_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"10.20.30.40": ["asn"]}), + "fqdn": "cert.pl", + "url": "http://192.168.0.1/asd", + "address": [{"ip": '10.20.30.40', + "asn": '1234'}]})) + + def test_adding_geoip_data_if_asn_is_valid(self): + data_init = 
self._get_actual_data_for_adding_asn_cc_if_possible() + self._set_asn_db_return_value_if_enabled(45678) + data_expected = self._enricher_execution_helper(data_init) + self.assertEqual([{u'asn': 45678, u'ip': u'127.0.0.1'}, + {u'asn': 45678, u'ip': u'192.187.0.1'}, + {u'asn': 45678, u'ip': u'10.15.1.255'}], data_expected["address"]) + self.assertEqual(([], {u'10.15.1.255': [u'asn'], + u'127.0.0.1': [u'asn'], + u'192.187.0.1': [u'asn']}), + data_expected["enriched"]) + + def test_adding_geoip_data_if_asn_is_not_valid(self): + data_init = self._get_actual_data_for_adding_asn_cc_if_possible() + self._set_asn_db_side_effect_if_enabled(GeoIP2Error) + data_expected = self._enricher_execution_helper(data_init) + self.assertEqual([{u'ip': u'127.0.0.1'}, + {u'ip': u'192.187.0.1'}, + {u'ip': u'10.15.1.255'}], data_expected["address"]) + self.assertEqual(([], {}), data_expected["enriched"]) + + @unittest.mock.patch('n6datapipeline.enrich.LOGGER') + def test_existing_geoip_data__drop_and_add_asn__if_asn_is_valid(self, LOGGER_mock): + data_init = self._get_actual_data_for_existing_asn_cc_always_dropped_and_new_ones_added_if_possible() + self._set_asn_db_return_value_if_enabled(456789) + data_expected = self._enricher_execution_helper(data_init, expected_num_of_warnings=2) + self.assertEqual([{u'ip': u'127.0.0.1', u'cc': u'JP', u'asn': 456789}, + {u'ip': u'192.187.0.1', u'cc': u'US', u'asn': 456789}, + {u'ip': u'10.15.1.255', u'asn': 456789}], + data_expected["address"]) + self.assertEqual(([], {u'127.0.0.1': [u'asn'], + u'192.187.0.1': [u'asn'], + u'10.15.1.255': [u'asn']}), + data_expected["enriched"]) + self.assertEqual(len(LOGGER_mock.warning.mock_calls), self.expected_num_of_warnings) + + @unittest.mock.patch('n6datapipeline.enrich.LOGGER') + def test_existing_geoip_data__drop_asn__if_asn_is_invalid(self, LOGGER_mock): + data_init = self._get_actual_data_for_existing_asn_cc_always_dropped_and_new_ones_added_if_possible() + self._set_asn_db_side_effect_if_enabled(GeoIP2Error) + data_expected = self._enricher_execution_helper(data_init, expected_num_of_warnings=2) + self.assertEqual([{u'ip': u'127.0.0.1', u'cc': 'JP'}, + {u'ip': u'192.187.0.1', u'cc': 'US'}, + {u'ip': u'10.15.1.255'}], + data_expected["address"]) + self.assertEqual(([], {}), data_expected["enriched"]) + self.assertEqual(len(LOGGER_mock.warning.mock_calls), self.expected_num_of_warnings) + + def test__enrich__with_excluded_ips_config__without_any_ip_to_exclude(self): + data = super(TestEnricherNoCCDatabase, + self).test__enrich__with_excluded_ips_config__without_any_ip_to_exclude() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {"127.0.0.1": ["asn", "ip"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl", + "address": [{"ip": '127.0.0.1', + "asn": '1234'}]})) + + +class TestEnricherNoGeoIPDatabase(_BaseTestEnricher, unittest.TestCase): + + MOCK_CONFIG = { + 'dnshost': '8.8.8.8', + 'dnsport': 53, + 'geoippath': '', + 'asndatabasefilename': '', + 'citydatabasefilename': '', + 'excluded_ips': [], + } + + def test__ip_to_asn__called_or_not(self): + data = super(TestEnricherNoGeoIPDatabase, self).test__ip_to_asn__called_or_not() + self._assert_geoip_method_not_called(self.enricher.ip_to_asn) + + def test__ip_to_cc__called_or_not(self): + super(TestEnricherNoGeoIPDatabase, self).test__ip_to_cc__called_or_not() + self._assert_geoip_method_not_called(self.enricher.ip_to_cc) + + def test__enrich__with_fqdn_given(self): + data = self.enricher.enrich(RecordDict({"fqdn": "cert.pl"})) + 
self.enricher._resolver.query.assert_called_once_with("cert.pl", "A") + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"127.0.0.1": ["ip"]}), + "fqdn": "cert.pl", + "address": [{"ip": '127.0.0.1'}]})) + + def test__enrich__with_fqdn_given__resolved_to_various_ips_with_duplicates(self): + data = super(TestEnricherNoGeoIPDatabase, + self).test__enrich__with_fqdn_given__resolved_to_various_ips_with_duplicates() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"1.0.1.1": ["ip"], + "1.1.1.1": ["ip"], + "12.11.10.9": ["ip"], + "127.0.0.1": ["ip"], + "13.1.2.3": ["ip"], + "2.2.2.2": ["ip"]}), + "fqdn": "cert.pl", + "address": [{"ip": '1.0.1.1'}, # note: *removed IP duplicates* and + {"ip": '1.1.1.1'}, # *ordered* by IP (textually) + {"ip": '12.11.10.9'}, + {"ip": '127.0.0.1'}, + {"ip": '13.1.2.3'}, + {"ip": '2.2.2.2'}]})) + + def test__enrich__with_url_given(self): + data = super(TestEnricherNoGeoIPDatabase, self).test__enrich__with_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {"127.0.0.1": ["ip"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl", + "address": [{"ip": '127.0.0.1'}]})) + + def test__enrich__with_ip_url_given(self): + data = super(TestEnricherNoGeoIPDatabase, self).test__enrich__with_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"192.168.0.1": ["ip"]}), + "url": "http://192.168.0.1/asd", + "address": [{"ip": '192.168.0.1'}]})) + + def test__enrich__with_ip_url_given__with_nodns_flag(self): + data = super(TestEnricherNoGeoIPDatabase, + self).test__enrich__with_ip_url_given__with_nodns_flag() + self.assertEqualIncludingTypes(data, RecordDict({ + # (here the '_do_not_resolve_fqdn_to_ip' flag did *not* change behaviour) + "enriched": ([], {"192.168.0.1": ["ip"]}), + "url": "http://192.168.0.1/asd", + "address": [{"ip": '192.168.0.1'}], + "_do_not_resolve_fqdn_to_ip": True})) + + def test__enrich__with_fqdn_and_url_given(self): + data = super(TestEnricherNoGeoIPDatabase, self).test__enrich__with_fqdn_and_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"127.0.0.1": ["ip"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "cert.pl", + "address": [{"ip": '127.0.0.1'}]})) + + def test__enrich__with_fqdn_and_ip_url_given(self): + data = super(TestEnricherNoGeoIPDatabase, self).test__enrich__with_fqdn_and_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {"127.0.0.1": ["ip"]}), + "url": "http://192.168.0.1/asd", + "fqdn": "cert.pl", + "address": [{"ip": '127.0.0.1'}]})) + + def test__enrich__with_address_and_fqdn_given(self): + data = super(TestEnricherNoGeoIPDatabase, self).test__enrich__with_address_and_fqdn_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {}), + "fqdn": "cert.pl", + "address": [{"ip": '10.20.30.40'}]})) + + def test__enrich__with_address_and_fqdn_given__with_nodns_flag(self): + data = super(TestEnricherNoGeoIPDatabase, + self).test__enrich__with_address_and_fqdn_given__with_nodns_flag() + self.assertEqualIncludingTypes(data, RecordDict({ + # (here the '_do_not_resolve_fqdn_to_ip' flag did *not* change behaviour) + "enriched": ([], {}), + "fqdn": "cert.pl", + "address": [{"ip": '10.20.30.40'}], + "_do_not_resolve_fqdn_to_ip": True})) + + def test__enrich__with_address_and_url_given(self): + data = super(TestEnricherNoGeoIPDatabase, self).test__enrich__with_address_and_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + 
"enriched": (["fqdn"], {}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl", + "address": [{"ip": '10.20.30.40'}]})) + + def test__enrich__with_address_and_ip_url_given(self): + data = super(TestEnricherNoGeoIPDatabase, self).test__enrich__with_address_and_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {}), + "url": "http://192.168.0.3/asd", + "address": [{"ip": '10.20.30.40'}]})) + + def test__enrich__with_address_and_fqdn_and_url_given(self): + data = super(TestEnricherNoGeoIPDatabase, + self).test__enrich__with_address_and_fqdn_and_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {}), + "fqdn": "cert.pl", + "url": "http://www.nask.pl/asd", + "address": [{"ip": '10.20.30.40'}]})) + + def test__enrich__with_address_and_fqdn_and_ip_url_given(self): + data = super(TestEnricherNoGeoIPDatabase, + self).test__enrich__with_address_and_fqdn_and_ip_url_given() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": ([], {}), + "fqdn": "cert.pl", + "url": "http://192.168.0.1/asd", + "address": [{"ip": '10.20.30.40'}]})) + + def test_existing_geoip_data__drop_and_add_asn__if_asn_is_valid(self): + # no additional GeoIP data should be added and existing ASN/CC + # values should not be dropped + data_init = self._get_actual_data_for_existing_asn_cc_always_dropped_and_new_ones_added_if_possible() + data_expected = self._enricher_execution_helper(data_init) + self.assertEqual([{u'ip': u'127.0.0.1', u'cc': u'JP'}, + {u'ip': u'192.187.0.1', u'cc': u'US', u'asn': 424242}, + {u'ip': u'10.15.1.255', u'asn': 434343}], + data_expected["address"]) + self.assertEqual(([], {}), data_expected["enriched"]) + + def test__enrich__with_excluded_ips_config__without_any_ip_to_exclude(self): + data = super(TestEnricherNoGeoIPDatabase, + self).test__enrich__with_excluded_ips_config__without_any_ip_to_exclude() + self.assertEqualIncludingTypes(data, RecordDict({ + "enriched": (["fqdn"], {"127.0.0.1": ["ip"]}), + "url": "http://www.nask.pl/asd", + "fqdn": "www.nask.pl", + "address": [{"ip": '127.0.0.1'}]})) diff --git a/N6DataPipeline/setup.py b/N6DataPipeline/setup.py index 5bf02a4..29cf6fe 100644 --- a/N6DataPipeline/setup.py +++ b/N6DataPipeline/setup.py @@ -56,7 +56,7 @@ def list_console_scripts(): n6_version = get_n6_version('.n6-version') -requirements = ['n6sdk==' + n6_version, 'n6lib==' + n6_version, 'intelmq'] +requirements = ['n6sdk==' + n6_version, 'n6lib==' + n6_version, 'intelmq', 'iptools==0.7.0'] console_scripts = list_console_scripts() setup( diff --git a/N6DataSources/console_scripts b/N6DataSources/console_scripts index 232befc..51eb235 100644 --- a/N6DataSources/console_scripts +++ b/N6DataSources/console_scripts @@ -1,2 +1,5 @@ -#n6collector_xxx = n6datasources.collectors.xxx:main -#n6parser_xxx = n6datasources.parsers.xxx:main +# collectors +n6collector_abusechfeodotracker = n6datasources.collectors.abuse_ch:AbuseChFeodoTrackerCollector_main + +# parsers +n6parser_abusechfeodotracker202110 = n6datasources.parsers.abuse_ch:AbuseChFeodoTracker202110Parser_main diff --git a/N6DataSources/n6datasources/collectors/__init__.py b/N6DataSources/n6datasources/collectors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/N6DataSources/n6datasources/collectors/abuse_ch.py b/N6DataSources/n6datasources/collectors/abuse_ch.py new file mode 100644 index 0000000..d91cdcf --- /dev/null +++ b/N6DataSources/n6datasources/collectors/abuse_ch.py @@ -0,0 +1,77 @@ +# Copyright (c) 2013-2021 NASK. 
All rights reserved. + +""" +Collectors: abuse-ch.feodotracker (TODO: other `abuse-ch` collectors...). +""" + +from n6datasources.collectors.base import ( + BaseDownloadingTimeOrderedRowsCollector, + add_collector_entry_point_functions, +) +from n6lib.csv_helpers import extract_field_from_csv_row +from n6lib.datetime_helpers import parse_iso_datetime_to_utc +from n6lib.log_helpers import get_logger + + +LOGGER = get_logger(__name__) + + +class _BaseAbuseChDownloadingTimeOrderedRowsCollector(BaseDownloadingTimeOrderedRowsCollector): + + pickle_protocol = 2 # (for interoperability with the Py2 version) + + row_time_legacy_state_key = None + time_field_index = None + + @property + def source_config_section(self): + return 'abusech_{}'.format(self.get_source_channel().replace('-', '_')) + + def load_state(self): + state = super().load_state() + if self.row_time_legacy_state_key and self.row_time_legacy_state_key in state: + # got `state` in a legacy form + row_time = self.normalize_row_time(state[self.row_time_legacy_state_key]) + state = { + # note: one or a few rows (those containing this "boundary" + # time value) will be duplicated, but we can live with that + self._NEWEST_ROW_TIME_STATE_KEY: row_time, + self._NEWEST_ROWS_STATE_KEY: set(), + } + return state + + def pick_raw_row_time(self, row): + return extract_field_from_csv_row(row, column_index=self.time_field_index).strip() + + def clean_row_time(self, raw_row_time): + return self.normalize_row_time(raw_row_time) + + def normalize_row_time(self, raw_row_time): + return str(parse_iso_datetime_to_utc(raw_row_time)) + + +class AbuseChFeodoTrackerCollector(_BaseAbuseChDownloadingTimeOrderedRowsCollector): + + raw_format_version_tag = '202110' + + time_field_index = 0 + + def get_source_channel(self, **processed_data): + return 'feodotracker' + + def all_rows_from_orig_data(self, orig_data): + all_rows = super().all_rows_from_orig_data(orig_data) + return reversed(all_rows) + + def should_row_be_used(self, row): + if not row.strip() or row.startswith('#'): + return False + try: + raw_row_time = extract_field_from_csv_row(row, column_index=self.time_field_index) + self.normalize_row_time(raw_row_time) + return True + except ValueError: + return False + + +add_collector_entry_point_functions(__name__) diff --git a/N6DataSources/n6datasources/collectors/base.py b/N6DataSources/n6datasources/collectors/base.py new file mode 100644 index 0000000..e983edf --- /dev/null +++ b/N6DataSources/n6datasources/collectors/base.py @@ -0,0 +1,1153 @@ +# Copyright (c) 2013-2021 NASK. All rights reserved. + +""" +Collector base classes + auxiliary tools. 
+""" + +import contextlib +import datetime +import pickle +import hashlib +import os +import sys +import time +from math import trunc +from typing import Optional + +import requests + +from n6lib.config import ( + ConfigError, + ConfigMixin, + ConfigSection, +) +from n6datapipeline.base import LegacyQueuedBase +from n6lib.class_helpers import attr_required +from n6lib.common_helpers import ( + AtomicallySavedFile, + as_bytes, + make_exc_ascii_str, +) +from n6lib.http_helpers import RequestPerformer +from n6lib.log_helpers import ( + get_logger, + logging_configured, +) +from n6lib.typing_helpers import KwargsDict + + +LOGGER = get_logger(__name__) + + +# +# Mixin classes + +class CollectorConfigMixin(ConfigMixin): + + def get_config_spec_format_kwargs(self): + return {} + + def set_configuration(self): + if self.is_config_spec_or_group_declared(): + self.config = self.get_config_section(**self.get_config_spec_format_kwargs()) + else: + # backward-compatible behavior needed by a few collectors + # that have `config_group = None` and -- at the same + # time -- no `config_spec`/`config_spec_pattern` + self.config = ConfigSection('') + + +class CollectorWithStateMixin(object): + + """ + Mixin for tracking state of an inheriting collector. + + Any picklable object can be saved as a state and then be retrieved + as an object of the same type. + """ + + pickle_protocol = pickle.HIGHEST_PROTOCOL + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._cache_file_path = os.path.join(os.path.expanduser( + self.config['cache_dir']), self.get_cache_file_name()) + + def load_state(self): + """ + Load collector's state from cache. + + Returns: + Unpickled object of its original type. + """ + try: + with open(self._cache_file_path, 'rb') as cache_file: + state = pickle.load(cache_file) + except (OSError, ValueError, EOFError) as exc: + state = self.make_default_state() + LOGGER.warning( + "Could not load state (%s), returning: %r", + make_exc_ascii_str(exc), + state) + else: + LOGGER.info("Loaded state: %r", state) + return state + + def save_state(self, state): + """ + Save any picklable object as a collector's state. + + Args: + `state`: a picklable object. + """ + cache_dir = os.path.dirname(self._cache_file_path) + try: + os.makedirs(cache_dir, 0o700) + except OSError: + pass + + with AtomicallySavedFile(self._cache_file_path, 'wb') as f: + pickle.dump(state, f, self.pickle_protocol) + LOGGER.info("Saved state: %r", state) + + def get_cache_file_name(self): + source_channel = self.get_source_channel() + source = self.get_source(source_channel=source_channel) + return '{}.{}.pickle'.format(source, self.__class__.__name__) + + def make_default_state(self): + return None + + +# +# Base classes + +class AbstractBaseCollector(object): + + """ + Abstract base class for a collector script implementations. + """ + + @classmethod + def run_script(cls): + with logging_configured(): + init_kwargs = cls.get_script_init_kwargs() + collector = cls(**init_kwargs) # noqa + collector.run_handling() + + @classmethod + def get_script_init_kwargs(cls): + """ + A class method: get a dict of kwargs for instantiation in a script. + + The default implementation returns an empty dict. + """ + return {} + + # + # Permanent (daemon-like) processing + + def run_handling(self): + """ + Run the event loop until Ctrl+C is pressed. 
+ """ + try: + self.run() + except KeyboardInterrupt: + self.stop() + + # + # Abstract methods (must be overridden) + + def run(self): + raise NotImplementedError + + def stop(self): + raise NotImplementedError + + +class BaseCollector(CollectorConfigMixin, LegacyQueuedBase, AbstractBaseCollector): + + """ + The standard "root" base class for collectors. + """ + + output_queue = { + 'exchange': 'raw', + 'exchange_type': 'topic', + } + + # None or a string being the tag of the raw data format version + # (can be set in a subclass) + raw_format_version_tag = None + + # the name of the config group + # (it does not have to be implemented if one of the `config_spec` + # or the `config_spec_pattern` attribute is set in a subclass, + # containing a declaration of exactly *one* config section) + config_group = None + + # a sequence of required config fields (can be extended in + # subclasses; typically, 'source' should be included there!) + config_required = ('source',) + # (NOTE: the `source` setting value in the config is only + # the first part -- the `label` part -- of the actual + # source specification string '