From c0b7fe41a1939204edf67c2aef761372cd92841f Mon Sep 17 00:00:00 2001
From: Jason Jorgensen
Date: Fri, 13 May 2022 12:17:38 -0500
Subject: [PATCH] Add an alternative Datadog metric format, 'servicetags', to
 limit the number of metrics that are created and make monitoring them
 easier; minor fixes so the docs build

---
 awslimitchecker/metrics/datadog.py            |  75 ++++++++++--
 awslimitchecker/tests/metrics/test_datadog.py | 115 ++++++++++++++++++
 docs/source/cli_usage.rst                     |  46 +++++--
 docs/source/cli_usage.rst.template            |  20 +++
 docs/source/getting_started.rst               |   2 +-
 docs/source/limits.rst                        |   4 +-
 6 files changed, 235 insertions(+), 27 deletions(-)

diff --git a/awslimitchecker/metrics/datadog.py b/awslimitchecker/metrics/datadog.py
index c2b5fc3a..b110b64b 100644
--- a/awslimitchecker/metrics/datadog.py
+++ b/awslimitchecker/metrics/datadog.py
@@ -53,7 +53,8 @@ class Datadog(MetricsProvider):
 
     def __init__(
         self, region_name, prefix='awslimitchecker.', api_key=None,
-        extra_tags=None, host='https://api.datadoghq.com'
+        extra_tags=None, host='https://api.datadoghq.com',
+        metric_format='original'
     ):
         """
         Initialize the Datadog metrics provider. This class does not have any
@@ -76,6 +77,15 @@ def __init__(
         :param extra_tags: CSV list of additional tags to send with metrics.
             All metrics will automatically be tagged with ``region:``
         :type extra_tags: str
+        :param metric_format: ``original``, the default, creates one metric
+            name per service limit, e.g. `awslimitchecker.s3.buckets.limit`
+            for the `S3 / Buckets` limit. ``servicetags`` uses only two metric
+            names, `awslimitchecker.limit` and `awslimitchecker.max_usage`,
+            with the limit identified by a tag such as
+            `service_limit:s3.buckets`. This allows a single monitor with one
+            query: `avg(last_4h):avg:awslimitchecker.max_usage{*} by {service_limit} /
+            avg:awslimitchecker.limit{*} by {service_limit} * 100 > 95`
+        :type metric_format: str
         """
         super(Datadog, self).__init__(region_name)
         self._prefix = prefix
@@ -84,6 +94,7 @@ def __init__(
             self._tags.extend(extra_tags.split(','))
         self._api_key = os.environ.get('DATADOG_API_KEY')
         self._host = os.environ.get('DATADOG_HOST', host)
+        self._metric_format = metric_format
         if api_key is not None:
             self._api_key = api_key
         if self._api_key is None:
@@ -122,6 +133,19 @@ def _name_for_metric(self, service, limit):
             re.sub(r'[^0-9a-zA-Z]+', '_', limit)
         )).lower()
 
+    def _name_for_datadog(self, name):
+        """
+        Return a name that is safe for Datadog.
+
+        :param name: service, limit, or other name
+        :type name: str
+        :return: Datadog-safe name
+        :rtype: str
+        """
+        return ('%s' % (
+            re.sub(r'[^0-9a-zA-Z]+', '_', name)
+        )).lower()
+
     def flush(self):
         ts = int(time.time())
         logger.debug('Flushing metrics to Datadog.')
@@ -137,21 +161,50 @@ def flush(self):
                 max_usage = 0
             else:
                 max_usage = max(u).get_value()
-            mname = self._name_for_metric(lim.service.service_name, lim.name)
-            series.append({
-                'metric': '%s.max_usage' % mname,
-                'points': [[ts, max_usage]],
-                'type': 'gauge',
-                'tags': self._tags
-            })
             limit = lim.get_limit()
-            if limit is not None:
+
+            if self._metric_format == 'original':
+                mname = self._name_for_metric(
+                    lim.service.service_name, lim.name)
                 series.append({
-                    'metric': '%s.limit' % mname,
-                    'points': [[ts, limit]],
+                    'metric': '%s.max_usage' % mname,
+                    'points': [[ts, max_usage]],
                     'type': 'gauge',
                     'tags': self._tags
                 })
+                if limit is not None:
+                    series.append({
+                        'metric': '%s.limit' % mname,
+                        'points': [[ts, limit]],
+                        'type': 'gauge',
+                        'tags': self._tags
+                    })
+            elif self._metric_format == 'servicetags':
+                mtags = self._tags.copy()
+                mtags.extend(['service:%s' %
+                              self._name_for_datadog(lim.service.service_name)])
+                mtags.extend(['service_limit:%s.%s' %
+                              (self._name_for_datadog(lim.service.service_name),
+                               self._name_for_datadog(lim.name))])
+                series.append({
+                    'metric': '%smax_usage' % self._prefix,
+                    'points': [[ts, max_usage]],
+                    'type': 'gauge',
+                    'tags': mtags
+                })
+                limit = lim.get_limit()
+                if limit is not None:
+                    series.append({
+                        'metric': '%slimit' % self._prefix,
+                        'points': [[ts, limit]],
+                        'type': 'gauge',
+                        'tags': mtags
+                    })
+            else:
+                raise RuntimeError(
+                    "ERROR: Datadog metric provider metric_format must "
+                    "be 'original' or 'servicetags'."
+                )
         logger.info('POSTing %d metrics to datadog', len(series))
         data = {'series': series}
         encoded = json.dumps(data).encode('utf-8')
diff --git a/awslimitchecker/tests/metrics/test_datadog.py b/awslimitchecker/tests/metrics/test_datadog.py
index 77ebf592..09c7ef93 100644
--- a/awslimitchecker/tests/metrics/test_datadog.py
+++ b/awslimitchecker/tests/metrics/test_datadog.py
@@ -171,6 +171,7 @@ def setup(self):
             m_init.return_value = None
         self.cls = Datadog()
         self.cls._host = 'https://api.datadoghq.com'
+        self.cls._metric_format = 'original'
 
 
 class TestValidateAuth(DatadogTester):
@@ -228,6 +229,14 @@ def test_simple(self):
         ) == 'foobar.service_name_.limit_name_'
 
 
+class TestSafeName(DatadogTester):
+
+    def test_simple(self):
+        assert self.cls._name_for_datadog(
+            'limit* NAME .'
+        ) == 'limit_name_'
+
+
 class TestFlush(DatadogTester):
 
     @freeze_time("2016-12-16 10:40:42", tz_offset=0, auto_tick_seconds=6)
@@ -297,6 +306,112 @@ def test_happy_path(self):
         assert c[2]['headers'] == {'Content-type': 'application/json'}
         assert json.loads(c[2]['body'].decode()) == expected
 
+    @freeze_time("2016-12-16 10:40:42", tz_offset=0, auto_tick_seconds=6)
+    def test_servicetags_format(self):
+        self.cls._prefix = 'prefix.'
+        self.cls._tags = ['tag1', 'tag:2']
+        self.cls._limits = []
+        self.cls._api_key = 'myKey'
+        self.cls._metric_format = 'servicetags'
+        self.cls.set_run_duration(123.45)
+        limA = Mock(
+            name='limitA', service=Mock(service_name='SVC1')
+        )
+        type(limA).name = 'limitA'
+        limA.get_current_usage.return_value = []
+        limA.get_limit.return_value = None
+        self.cls.add_limit(limA)
+        limB = Mock(
+            name='limitB', service=Mock(service_name='SVC1')
+        )
+        type(limB).name = 'limitB'
+        mocku = Mock()
+        mocku.get_value.return_value = 6
+        limB.get_current_usage.return_value = [mocku]
+        limB.get_limit.return_value = 10
+        self.cls.add_limit(limB)
+        mock_http = Mock()
+        mock_resp = Mock(status=200, data='{"status": "ok"}')
+        mock_http.request.return_value = mock_resp
+        self.cls._http = mock_http
+        self.cls.flush()
+        ts = 1481884842
+        expected = {
+            'series': [
+                {
+                    'metric': 'prefix.runtime',
+                    'points': [[ts, 123.45]],
+                    'type': 'gauge',
+                    'tags': ['tag1', 'tag:2']
+                },
+                {
+                    'metric': 'prefix.max_usage',
+                    'points': [[ts, 0]],
+                    'type': 'gauge',
+                    'tags': ['tag1', 'tag:2',
+                             'service:svc1', 'service_limit:svc1.limita']
+                },
+                {
+                    'metric': 'prefix.max_usage',
+                    'points': [[ts, 6]],
+                    'type': 'gauge',
+                    'tags': ['tag1', 'tag:2',
+                             'service:svc1', 'service_limit:svc1.limitb']
+                },
+                {
+                    'metric': 'prefix.limit',
+                    'points': [[ts, 10]],
+                    'type': 'gauge',
+                    'tags': ['tag1', 'tag:2',
+                             'service:svc1', 'service_limit:svc1.limitb']
+                }
+            ]
+        }
+        assert len(mock_http.mock_calls) == 1
+        c = mock_http.mock_calls[0]
+        assert c[0] == 'request'
+        assert c[1] == (
+            'POST', 'https://api.datadoghq.com/api/v1/series?api_key=myKey'
+        )
+        assert len(c[2]) == 2
+        assert c[2]['headers'] == {'Content-type': 'application/json'}
+        assert json.loads(c[2]['body'].decode()) == expected
+
+    @freeze_time("2016-12-16 10:40:42", tz_offset=0, auto_tick_seconds=6)
+    def test_invalid_format(self):
+        self.cls._prefix = 'prefix.'
+        self.cls._tags = ['tag1', 'tag:2']
+        self.cls._limits = []
+        self.cls._api_key = 'myKey'
+        self.cls._metric_format = 'invalidformat'
+        self.cls.set_run_duration(123.45)
+        limA = Mock(
+            name='limitA', service=Mock(service_name='SVC1')
+        )
+        type(limA).name = 'limitA'
+        limA.get_current_usage.return_value = []
+        limA.get_limit.return_value = None
+        self.cls.add_limit(limA)
+        limB = Mock(
+            name='limitB', service=Mock(service_name='SVC1')
+        )
+        type(limB).name = 'limitB'
+        mocku = Mock()
+        mocku.get_value.return_value = 6
+        limB.get_current_usage.return_value = [mocku]
+        limB.get_limit.return_value = 10
+        self.cls.add_limit(limB)
+        mock_http = Mock()
+        mock_resp = Mock(status=200, data='{"status": "ok"}')
+        mock_http.request.return_value = mock_resp
+        self.cls._http = mock_http
+        with pytest.raises(RuntimeError) as exc:
+            self.cls.flush()
+        assert str(exc.value) == "ERROR: Datadog metric provider " \
+                                 "metric_format must be " \
+                                 "'original' or 'servicetags'."
+        assert len(mock_http.mock_calls) == 0
+
     @freeze_time("2016-12-16 10:40:42", tz_offset=0, auto_tick_seconds=6)
     def test_api_error_non_default_host(self):
         self.cls._prefix = 'prefix.'
diff --git a/docs/source/cli_usage.rst b/docs/source/cli_usage.rst
index eb8d752b..7362f809 100644
--- a/docs/source/cli_usage.rst
+++ b/docs/source/cli_usage.rst
@@ -222,13 +222,13 @@ and limits followed by ``(API)`` have been obtained from the service's API.
 .. code-block:: console
 
     (venv)$ awslimitchecker -l
-    ApiGateway/API keys per account                       500.0 (Quotas)
+    ApiGateway/API keys per account                       10000.0 (Quotas)
     ApiGateway/Client certificates per account            60.0 (Quotas)
     ApiGateway/Custom authorizers per API                 10
     ApiGateway/Documentation parts per API                2000
     ApiGateway/Edge APIs per account                      120.0 (Quotas)
     (...)
-    AutoScaling/Auto Scaling groups                       200 (API)
+    AutoScaling/Auto Scaling groups                       500 (API)
     (...)
     Lambda/Function Count                                 None
     (...)
@@ -253,7 +253,7 @@ from the Service Quotas service.
     ApiGateway/Documentation parts per API                2000
     ApiGateway/Edge APIs per account                      120
     (...)
-    AutoScaling/Auto Scaling groups                       200 (API)
+    AutoScaling/Auto Scaling groups                       500 (API)
     (...)
     Lambda/Function Count                                 None
     (...)
@@ -275,13 +275,13 @@ from Trusted Advisor for all commands.
 .. code-block:: console
 
     (venv)$ awslimitchecker -l --skip-ta
-    ApiGateway/API keys per account                       500.0 (Quotas)
+    ApiGateway/API keys per account                       10000.0 (Quotas)
     ApiGateway/Client certificates per account            60.0 (Quotas)
     ApiGateway/Custom authorizers per API                 10
     ApiGateway/Documentation parts per API                2000
     ApiGateway/Edge APIs per account                      120.0 (Quotas)
     (...)
-    AutoScaling/Auto Scaling groups                       200 (API)
+    AutoScaling/Auto Scaling groups                       500 (API)
     (...)
     Lambda/Function Count                                 None
     (...)
@@ -344,15 +344,15 @@ using their IDs).
 .. code-block:: console
 
     (venv)$ awslimitchecker -u
-    ApiGateway/API keys per account                       2
+    ApiGateway/API keys per account                       0
     ApiGateway/Client certificates per account            0
-    ApiGateway/Custom authorizers per API                 max: 2d7q4kzcmh=2 (2d7q4kz (...)
-    ApiGateway/Documentation parts per API                max: 2d7q4kzcmh=2 (2d7q4kz (...)
-    ApiGateway/Edge APIs per account                      9
+    ApiGateway/Custom authorizers per API
+    ApiGateway/Documentation parts per API
+    ApiGateway/Edge APIs per account                      0
     (...)
-    VPC/Subnets per VPC                                   max: vpc-f4279a92=6 (vpc-f (...)
+    VPC/Subnets per VPC                                   max: vpc-02031d86da0b6d120 (...)
     VPC/VPCs                                              2
-    VPC/Virtual private gateways                          1
+    VPC/Virtual private gateways                          0
@@ -377,7 +377,7 @@ For example, to override the limits of EC2's "EC2-Classic Elastic IPs" and
 .. code-block:: console
 
     (venv)$ awslimitchecker -L "AutoScaling/Auto Scaling groups"=321 --limit="AutoScaling/Launch configurations"=456 -l
-    ApiGateway/API keys per account                       500.0 (Quotas)
+    ApiGateway/API keys per account                       10000.0 (Quotas)
     ApiGateway/Client certificates per account            60.0 (Quotas)
     ApiGateway/Custom authorizers per API                 10
     ApiGateway/Documentation parts per API                2000
@@ -412,7 +412,7 @@ Using a command like:
 .. code-block:: console
 
     (venv)$ awslimitchecker --limit-override-json=limit_overrides.json -l
-    ApiGateway/API keys per account                       500.0 (Quotas)
+    ApiGateway/API keys per account                       10000.0 (Quotas)
     ApiGateway/Client certificates per account            60.0 (Quotas)
     ApiGateway/Custom authorizers per API                 10
     ApiGateway/Documentation parts per API                2000
@@ -574,6 +574,26 @@ environment variable) and an optional ``extra_tags`` parameter:
 Metrics will be pushed to the provider only when awslimitchecker is done
 checking all limits.
 
+There is also an alternative metric format for the
+:py:class:`~awslimitchecker.metrics.datadog.Datadog` metrics provider, which
+uses only two metric names, ``awslimitchecker.limit`` and
+``awslimitchecker.max_usage``; each service limit is added as a tag on these
+metrics.
+
+To use this alternative format, add the optional parameter ``metric_format=servicetags``:
+
+.. code-block:: console
+
+    (venv)$ awslimitchecker \
+        --metrics-provider=Datadog \
+        --metrics-config=api_key=123456 \
+        --metrics-config=extra_tags=foo,bar,baz:blam \
+        --metrics-config=metric_format=servicetags
+
+You can then use the following query in a single Datadog monitor to cover all service limits:
+``avg(last_4h):avg:awslimitchecker.max_usage{*} by {service_limit} /
+avg:awslimitchecker.limit{*} by {service_limit} * 100 > 95``
+
 .. _cli_usage.alerts:
 
 Enable Alerts Provider
diff --git a/docs/source/cli_usage.rst.template b/docs/source/cli_usage.rst.template
index 38a0a5bf..657b9fb5 100644
--- a/docs/source/cli_usage.rst.template
+++ b/docs/source/cli_usage.rst.template
@@ -273,6 +273,26 @@ environment variable) and an optional ``extra_tags`` parameter:
 Metrics will be pushed to the provider only when awslimitchecker is done
 checking all limits.
 
+There is also an alternative metric format for the
+:py:class:`~awslimitchecker.metrics.datadog.Datadog` metrics provider, which
+uses only two metric names, ``awslimitchecker.limit`` and
+``awslimitchecker.max_usage``; each service limit is added as a tag on these
+metrics.
+
+To use this alternative format, add the optional parameter ``metric_format=servicetags``:
+
+.. code-block:: console
+
+    (venv)$ awslimitchecker \
+        --metrics-provider=Datadog \
+        --metrics-config=api_key=123456 \
+        --metrics-config=extra_tags=foo,bar,baz:blam \
+        --metrics-config=metric_format=servicetags
+
+You can then use the following query in a single Datadog monitor to cover all service limits:
+``avg(last_4h):avg:awslimitchecker.max_usage{*} by {service_limit} /
+avg:awslimitchecker.limit{*} by {service_limit} * 100 > 95``
+
 .. _cli_usage.alerts:
 
 Enable Alerts Provider
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index 7aa10b31..d2aaa403 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -179,7 +179,7 @@ Trusted Advisor
 awslimitchecker supports retrieving your current service limits via the
 `Trusted Advisor `_
-`"Service Limits" performance check `_
+`"Service Limits" performance check `_
 , for limits which Trusted Advisor tracks (currently a subset of what
 awslimitchecker knows about). The results of this check may not be available
 via the API for all accounts; as of December 2016, the Trusted Advisor
 documentation states that while
diff --git a/docs/source/limits.rst b/docs/source/limits.rst
index 76b10121..82dff6dc 100644
--- a/docs/source/limits.rst
+++ b/docs/source/limits.rst
@@ -222,7 +222,7 @@ type.
 Limit                                                                Trusted Advisor Quotas   API     Default
 ==================================================================== =============== ======== ======= ====
 All F Spot Instance Requests                                                         |check|          11
-All G Spot Instance Requests                                                         |check|          11
+All G Spot Instance Requests                                                                          11
 All Inf Spot Instance Requests                                                       |check|          64
 All P Spot Instance Requests                                                         |check|          16
 All Standard (A, C, D, H, I, M, R, T, Z) Spot Instance Requests                      |check|          1440
@@ -234,7 +234,7 @@ Max target capacity for all spot fleets in region
 Max target capacity per spot fleet                                                                    3000
 Rules per VPC security group                                                         |check|          60
 Running On-Demand All F instances                                                    |check|          128
-Running On-Demand All G instances                                                    |check|          128
+Running On-Demand All G instances                                                                     128
 Running On-Demand All P instances                                                    |check|          128
 Running On-Demand All Standard (A, C, D, H, I, M, R, T, Z) instances                |check|          1152
 Running On-Demand All X instances                                                    |check|          128
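
As a minimal sketch (not part of the patch itself), this is how the new
``metric_format='servicetags'`` option could be exercised from Python rather
than through the CLI. The wiring of limits into the provider below mirrors what
the awslimitchecker runner does; the region, API key, and extra tags are
placeholder values, and feeding the checker's limits into the provider by hand
is an assumption made only for illustration.

.. code-block:: python

    import time

    from awslimitchecker.checker import AwsLimitChecker
    from awslimitchecker.metrics.datadog import Datadog

    start = time.time()
    checker = AwsLimitChecker(region='us-east-1')
    checker.find_usage()  # populate current usage on every known limit

    # 'servicetags' sends only awslimitchecker.max_usage and
    # awslimitchecker.limit, tagged with service:<service> and
    # service_limit:<service>.<limit>; credentials here are placeholders.
    provider = Datadog(
        'us-east-1',
        api_key='123456',                  # or export DATADOG_API_KEY instead
        extra_tags='env:prod,team:infra',
        metric_format='servicetags'
    )
    for service_limits in checker.get_limits().values():
        for limit in service_limits.values():
            provider.add_limit(limit)
    provider.set_run_duration(time.time() - start)
    provider.flush()  # one POST of the whole batch to the /api/v1/series endpoint

Because the limit's identity moves into the ``service_limit`` tag, the number of
distinct metric names stays constant no matter how many services and limits are
reported, which is what makes the single-query monitor shown in the docs
possible.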