Skip to content

Commit

Permalink
Health and Protection
Browse files Browse the repository at this point in the history
  • Loading branch information
tremble committed Oct 7, 2024
1 parent 49ee575 commit 6d534b0
Show file tree
Hide file tree
Showing 6 changed files with 190 additions and 31 deletions.
78 changes: 65 additions & 13 deletions plugins/module_utils/_autoscaling/waiters.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,36 @@

from ..waiter import BaseWaiterFactory

WAITER_MAP = {
"Standby": "instances_in_standby",
"Terminated": "instances_terminated",
"Detached": "instances_detached",
"InService": "instances_in_service",
"HEALTHY": "instances_healthy",
"Healthy": "instances_healthy",
"UNHEALTHY": "instances_unhealthy",
"Unhealthy": "instances_unhealthy",
"Protected": "instances_protected",
"NotProtected": "instances_not_protected",
}


def _fail_on_instance_lifecycle_states(state):
return dict(state="failure", matcher="pathAny", expected=state, argument="AutoScalingInstances[].LifecycleState")


def _retry_on_instance_lifecycle_states(state):
return dict(state="retry", matcher="pathAny", expected=state, argument="AutoScalingInstances[].LifecycleState")


def _success_on_instance_lifecycle_states(state):
return dict(state="success", matcher="pathAll", expected=state, argument="AutoScalingInstances[].LifecycleState")


def _success_on_instance_health(health):
return dict(state="success", matcher="pathAll", expected=health, argument="AutoScalingInstances[].HealthStatus")


def _success_on_instance_protection(state):
return dict(state="success", matcher="pathAll", expected=state, argument="AutoScalingInstances[].ProtectedFromScaleIn")


def _no_instances(result):
return dict(state=result, matcher="path", expected=True, argument="length(AutoScalingInstances[]) == `0`")

Expand All @@ -26,43 +43,78 @@ class AutoscalingWaiterFactory(BaseWaiterFactory):
@property
def _waiter_model_data(self):
data = dict(
instances_in_service=dict(
instances_healthy=dict(
operation="DescribeAutoScalingInstances",
delay=5,
maxAttempts=120,
acceptors=[
_success_on_instance_health("HEALTHY"),
# Terminated Instances can't reach "Healthy"
_fail_on_instance_lifecycle_states("Terminating"),
_fail_on_instance_lifecycle_states("Terminated"),
_fail_on_instance_lifecycle_states("Terminating:Wait"),
_fail_on_instance_lifecycle_states("Terminating:Proceed"),
_fail_on_instance_lifecycle_states("Detaching"),
_fail_on_instance_lifecycle_states("Detached"),
_success_on_instance_lifecycle_states("InService"),
],
),
instances_in_standby=dict(
instances_unhealthy=dict(
operation="DescribeAutoScalingInstances",
delay=5,
maxAttempts=120,
acceptors=[
_success_on_instance_health("UNHEALTHY"),
# Instances in an unhealthy state can end up being automatically terminated
_no_instances("success"),
],
),
instances_protected=dict(
operation="DescribeAutoScalingInstances",
delay=5,
maxAttempts=120,
acceptors=[
_success_on_instance_protection(True),
],
),
instances_not_protected=dict(
operation="DescribeAutoScalingInstances",
delay=5,
maxAttempts=120,
acceptors=[
_success_on_instance_protection(False),
# Instances without protection can end up being automatically terminated
_no_instances("success"),
],
),
instances_in_service=dict(
operation="DescribeAutoScalingInstances",
delay=5,
maxAttempts=120,
acceptors=[
_success_on_instance_lifecycle_states("InService"),
# Terminated instances can't reach InService
_fail_on_instance_lifecycle_states("Terminating"),
_fail_on_instance_lifecycle_states("Terminated"),
_fail_on_instance_lifecycle_states("Terminating:Wait"),
_fail_on_instance_lifecycle_states("Terminating:Proceed"),
_fail_on_instance_lifecycle_states("Detaching"),
_fail_on_instance_lifecycle_states("Detached"),
_success_on_instance_lifecycle_states("Standby"),
],
),
instances_detached=dict(
instances_in_standby=dict(
operation="DescribeAutoScalingInstances",
delay=5,
maxAttempts=120,
acceptors=[
_success_on_instance_lifecycle_states("Standby"),
# Terminated instances can't reach Standby
_fail_on_instance_lifecycle_states("Terminating"),
_fail_on_instance_lifecycle_states("Terminated"),
_fail_on_instance_lifecycle_states("Terminating:Wait"),
_fail_on_instance_lifecycle_states("Terminating:Proceed"),
],
),
instances_detached=dict(
operation="DescribeAutoScalingInstances",
delay=5,
maxAttempts=120,
acceptors=[
_success_on_instance_lifecycle_states("Detached"),
_no_instances("success"),
],
Expand Down
9 changes: 7 additions & 2 deletions plugins/module_utils/autoscaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@

import typing

# Not intended for general re-use / re-import
from ._autoscaling import common as _common
from ._autoscaling import groups as _groups
from ._autoscaling import instances as _instances
from ._autoscaling import transformations as _transformations
from ._autoscaling import waiters as _waiters
from ._autoscaling.common import AnsibleAutoScalingError # pylint: disable=unused-import
from ._autoscaling.common import AutoScalingErrorHandler # pylint: disable=unused-import
from .retries import AWSRetry

if typing.TYPE_CHECKING:
Expand All @@ -29,6 +29,11 @@
from .transformation import AnsibleAWSResourceList
from .transformation import BotoResourceList

# Intended for general use / re-import
AnsibleAutoScalingError = _common.AnsibleAutoScalingError
AutoScalingErrorHandler = _common.AutoScalingErrorHandler
WAITER_MAP = _waiters.WAITER_MAP


def get_autoscaling_groups(
client: RetryingBotoClientWrapper, group_names: Optional[List[str]] = None
Expand Down
14 changes: 12 additions & 2 deletions plugins/module_utils/waiter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@

try:
import botocore.waiter as botocore_waiter
except ImportError:
pass # Handled by AnsibleAWSModule
import_error = None
except ImportError as e:
botocore_waiter = None
import_error = e


class BaseWaiterFactory:
Expand All @@ -36,6 +38,9 @@ class BaseWaiterFactory:
"""

def __init__(self):
if not botocore_waiter:
return

# While it would be nice to supliment this with the upstream data,
# unfortunately client doesn't have a public method for getting the
# waiter configs.
Expand Down Expand Up @@ -124,6 +129,11 @@ def _inject_ratelimit_retries(self, model_data):
return _model_data

def get_waiter(self, client, waiter_name):
# We shouldn't get here, but if someone's trying to use this without botocore installed
# let's re-raise the actual import error
if import_error:
raise import_error

waiters = self._model.waiter_names
if waiter_name not in waiters:
raise NotImplementedError(f"Unable to find waiter {waiter_name}. Available_waiters: {waiters}")
Expand Down
106 changes: 92 additions & 14 deletions plugins/modules/autoscaling_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
- B(Note:) When adding instances to an AutoScaling Group or returning instances to service
from standby, the desired capacity is B(always) incremented. If the total number of
instances would exceed the maximum size of the group then the operation will fail.
choices: ['present', 'attached', 'terminate', 'detached', 'standby']
choices: ['present', 'attached', 'terminated', 'detached', 'standby']
default: present
type: str
instance_ids:
Expand Down Expand Up @@ -160,6 +160,7 @@
import typing
from copy import deepcopy

from ansible_collections.amazon.aws.plugins.module_utils.autoscaling import WAITER_MAP
from ansible_collections.amazon.aws.plugins.module_utils.autoscaling import AnsibleAutoScalingError
from ansible_collections.amazon.aws.plugins.module_utils.autoscaling import AutoScalingErrorHandler
from ansible_collections.amazon.aws.plugins.module_utils.autoscaling import get_autoscaling_instances
Expand All @@ -182,7 +183,7 @@


# There's also a number of "Warmed" states that we could support with relatively minimal effort, but
# we can't test them
# we can't test them (currently)
STATE_MAP = {
"pending": ["Pending", "Pending:Proceed", "Pending:Wait"],
"stable": ["InService", "Standby"],
Expand All @@ -199,6 +200,15 @@ def _all_instance_ids(instances: List) -> Set[str]:
return {i.get("instance_id") for i in instances}


def _instance_ids_with_health(instances: List, health: str) -> Set[str]:
health = health.lower()
return {i.get("instance_id") for i in instances if i.get("health_status", "").lower() == health}


def _instance_ids_with_protection(instances: List, protection: bool) -> Set[str]:
return {i.get("instance_id") for i in instances if i.get("protected_from_scale_in", False) == protection}


def _instance_ids_in_states(instances: List, states: List[str]) -> Set[str]:
states = [s.lower() for s in states]
return {i.get("instance_id") for i in instances if i.get("lifecycle_state", "").lower() in states}
Expand All @@ -213,6 +223,28 @@ def _token_instance(instance_id, group_name):
)


@AutoScalingErrorHandler.common_error_handler("set instance health")
@AWSRetry.jittered_backoff()
def _set_instance_health(client: RetryingBotoClientWrapper, instance_id: str, health: str, respect_grace: bool):
return client.set_instance_health(
InstanceId=instance_id,
HealthStatus=health,
ShouldRespectGracePeriod=respect_grace,
)


@AutoScalingErrorHandler.common_error_handler("set instance protection")
@AWSRetry.jittered_backoff()
def _set_instance_protection(
client: RetryingBotoClientWrapper, instance_ids: Set[str], group_name: str, protected: bool
):
return client.set_instance_protection(
InstanceIds=list(instance_ids),
AutoScalingGroupName=group_name,
ProtectedFromScaleIn=protected,
)


@AutoScalingErrorHandler.common_error_handler("detach auto scaling instances from group")
@AWSRetry.jittered_backoff()
def _detach_instances(
Expand Down Expand Up @@ -283,16 +315,9 @@ def wait_instance_state(
if not instance_ids:
return

waiter_map = {
"Standby": "instances_in_standby",
"Terminated": "instances_terminated",
"Detached": "instances_detached",
"InService": "instances_in_service",
}

waiter_config = custom_waiter_config(timeout=wait_timeout, default_pause=10)

waiter = get_autoscaling_waiter(client, waiter_map[state])
waiter = get_autoscaling_waiter(client, WAITER_MAP[state])
AutoScalingErrorHandler.common_error_handler(f"wait for instances to reach {state}")(waiter.wait)(
InstanceIds=list(instance_ids),
WaiterConfig=waiter_config,
Expand All @@ -313,8 +338,12 @@ def _inject_instances(instances, group_name, missing_ids):
def _change_instances(instances, group_name, change_ids, state=None, health=None, protection=None):
for instance in instances:
if instance.get("instance_id") in change_ids:
if state:
if state is not None:
instance["lifecycle_state"] = state
if health is not None:
instance["health_status"] = health
if protection is not None:
instance["protected_from_scale_in"] = protection
return instances


Expand Down Expand Up @@ -585,7 +614,33 @@ def ensure_instance_health(
wait: bool,
wait_timeout: int,
) -> Tuple[bool, AnsibleAWSResourceList]:
return False, instances_start
# nb. With Health the API documentation's inconsistent:
# it appears to want Capitalized for set(), but spits out UPPERCASE for get()
if health is None:
return False, instances_start
if instance_ids is None:
instance_ids = _all_instance_ids(instances_start)
else:
instance_ids = set(instance_ids)

ready_ids = _instance_ids_with_health(instances_start, health) & instance_ids
changed_ids = instance_ids - ready_ids

if not changed_ids:
return False, instances_start

if check_mode:
health = health.upper()
changed_instances = _change_instances(deepcopy(instances_start), group_name, changed_ids, health=health)
return True, changed_instances

for instance_id in changed_ids:
_set_instance_health(client, instance_id, health, respect_grace_period)
health = health.upper()
wait_instance_state(client, health.upper(), check_mode, group_name, changed_ids, wait, wait_timeout)

instances_complete = get_autoscaling_instances(client, group_name=group_name)
return True, instances_complete


def ensure_instance_protection(
Expand All @@ -602,8 +657,27 @@ def ensure_instance_protection(
return False, instances_start
if instance_ids is None:
instance_ids = _all_instance_ids(instances_start)
else:
instance_ids = set(instance_ids)

return False, instances_start
ready_ids = _instance_ids_with_protection(instances_start, protection) & instance_ids
changed_ids = instance_ids - ready_ids

if not changed_ids:
return False, instances_start

if check_mode:
changed_instances = _change_instances(deepcopy(instances_start), group_name, changed_ids, protection=protection)
return True, changed_instances

_set_instance_protection(client, changed_ids, group_name, protection)

state = "Protected" if protection else "NotProtected"

wait_instance_state(client, state, check_mode, group_name, changed_ids, wait, wait_timeout)

instances_complete = get_autoscaling_instances(client, group_name=group_name)
return True, instances_complete


def ensure_instance_pool(
Expand Down Expand Up @@ -722,8 +796,8 @@ def _validate_remove_conditions(params: Dict[str, Any], instances: AnsibleAWSRes


def _validate_attach_conditions(params: Dict[str, Any], instances: AnsibleAWSResourceList) -> None:
instance_ids = set(params.get("instance_ids"))
all_ids = _all_instance_ids(instances)
instance_ids = set(params.get("instance_ids") or [])

# These instances are terminating, we can't do anything with them.
terminating_ids = _instance_ids_in_states(instances, STATE_MAP["terminating+"]) & instance_ids
Expand Down Expand Up @@ -812,6 +886,10 @@ def do(module):
after=dict(auto_scaling_instances=instances),
)

result["changed_pool"] = changed_pool
result["changed_protection"] = changed_protection
result["changed_health"] = changed_health

module.exit_json(**result)


Expand Down
Loading

0 comments on commit 6d534b0

Please sign in to comment.