Sign in

Filter Out AWS EC2 Instances which are failing health checks

There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.

This task identifies and segregates EC2 instances based on their health status. It checks each instance's status and flags any instance that fails its system or instance health checks. The aim is to isolate these instances for troubleshooting, recovery, or termination to ensure the reliability and efficiency of cloud operations.

import boto3
from datetime import datetime, timezone

# Load AWS credentials from a secure source.
# NOTE(review): `_get_creds` and `cred_label` are not defined in this script;
# presumably they are injected by the runbook platform -- confirm.
creds = _get_creds(cred_label)['creds']
access_key = creds['username']
secret_key = creds['password']


def get_ec2_client(region_name, access_key, secret_key):
    """Create an EC2 client for ``region_name``.

    Returns the boto3 client, or ``None`` (after printing the error) when the
    client cannot be created, so callers can skip the region instead of
    crashing.
    """
    try:
        return boto3.client(
            'ec2',
            region_name=region_name,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
        )
    except Exception as e:
        print(f"Failed to create EC2 client for region {region_name}: {e}")
        return None


def check_instance_health(ec2_client, instance_id):
    """Return True only when a running instance passes both status checks.

    The instance is considered healthy when its state is ``running`` and both
    the system status and the instance status are ``'ok'``. Any other outcome
    returns False: no client, no status entry, a non-running state, a non-ok
    status, or an error while calling or parsing the API response.
    """
    if not ec2_client:
        return False
    try:
        response = ec2_client.describe_instance_status(
            InstanceIds=[instance_id],
            IncludeAllInstances=True,
        )
        for status in response['InstanceStatuses']:
            if status['InstanceState']['Name'] == 'running':
                system_status = status['SystemStatus']['Status']
                instance_status = status['InstanceStatus']['Status']
                return system_status == 'ok' and instance_status == 'ok'
        return False
    except Exception as e:
        print(f"Error checking health for instance {instance_id}: {e}")
        return False


def alert_on_health_check_failures(instances_list, access_key, secret_key):
    """Split running instances into healthy and failing; return the failing ones.

    :param instances_list: Iterable of dicts with at least ``InstanceId``,
        ``Region`` and ``State`` keys. Entries whose ``State`` is not
        ``'running'`` (or is missing) are ignored.
    :param access_key: AWS access key ID.
    :param secret_key: AWS secret access key.
    :return: List of copies of the failing instance dicts, each with an extra
        ``AlertTime`` key holding a UTC ISO-8601 timestamp string.
    """
    failed_instances = []
    healthy_instances = []
    # One client per region is enough; building a new client for every
    # instance repeats the same setup work for instances sharing a region.
    clients_by_region = {}

    # Only running instances have meaningful health checks. `.get` keeps a
    # record without a 'State' key from aborting the whole run.
    running_instances = [
        instance for instance in instances_list
        if instance.get('State') == 'running'
    ]

    for instance in running_instances:
        region = instance['Region']
        instance_id = instance['InstanceId']

        if region not in clients_by_region:
            clients_by_region[region] = get_ec2_client(region, access_key, secret_key)
        ec2_client = clients_by_region[region]

        if not ec2_client:
            print(f"Failed to initialize EC2 client for {region}. Skipping instance {instance_id}.")
            continue

        if check_instance_health(ec2_client, instance_id):
            healthy_instances.append(instance)
            print(f"Instance {instance_id} in region {region} passed health checks.")
        else:
            failed_instance = {
                **instance,
                # Timezone-aware UTC string: unambiguous for alert readers and
                # safe to print or serialize downstream.
                'AlertTime': datetime.now(timezone.utc).isoformat(),
            }
            failed_instances.append(failed_instance)
            print(f"Alert: Instance {instance_id} in region {region} is failing health checks at {failed_instance['AlertTime']}")

    print(f"Summary: {len(healthy_instances)} instances passed health checks, {len(failed_instances)} failed.")
    return failed_instances


# `instances_list` is expected to be provided by an upstream task.
failed_instances = alert_on_health_check_failures(instances_list, access_key, secret_key)

if failed_instances:
    print("Failed Instances:")
    for instance in failed_instances:
        # AlertTime is already a pre-formatted string.
        print(f"{instance['InstanceId']} in {instance['Region']} at {instance['AlertTime']}")
else:
    print("No EC2 Instances with Failing Health Checks found.")
    # NOTE(review): presumably tells the runbook engine to skip the
    # downstream alerting sub-task when nothing failed -- confirm.
    context.skip_sub_tasks = True
copied
  1. 1

    Alert recipient email addresses about AWS EC2 instances failing health checks

    There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.

    The task involves sending automated email alerts to designated recipients about AWS EC2 instances that fail health checks, using AWS SNS for timely notifications to enhance operational response and system reliability.

    import boto3
    from botocore.exceptions import ClientError

    # Load AWS credentials from a secure source.
    # NOTE(review): `_get_creds` and `cred_label` are not defined in this script;
    # presumably they are injected by the runbook platform -- confirm.
    creds = _get_creds(cred_label)['creds']
    access_key = creds['username']
    secret_key = creds['password']


    def get_sns_client(access_key, secret_key, aws_region="us-east-1"):
        """Initialize an SNS client for the primary notification region."""
        return boto3.client(
            'sns',
            region_name=aws_region,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
        )


    def find_sns_topic(sns_client, topic_name):
        """Return the ARN of the topic named exactly ``topic_name``, or None.

        Every page of results is scanned, and the name is compared against the
        last ARN segment (arn:aws:sns:REGION:ACCOUNT:NAME). A substring test
        would also match e.g. ``<topic_name>-prod`` and publish to the wrong
        topic.
        """
        try:
            paginator = sns_client.get_paginator('list_topics')
            for page in paginator.paginate():
                for topic in page['Topics']:
                    if topic['TopicArn'].rsplit(':', 1)[-1] == topic_name:
                        return topic['TopicArn']
        except ClientError as e:
            print(f"Error retrieving SNS topics: {e}")
        return None


    def setup_sns_topic(topic_name, sns_client):
        """Return the ARN of an existing topic, creating it if needed.

        Returns None (after printing the error) when the topic cannot be
        created.
        """
        topic_arn = find_sns_topic(sns_client, topic_name)
        if topic_arn:
            return topic_arn
        try:
            topic = sns_client.create_topic(Name=topic_name)
        except ClientError as e:
            print(f"Failed to create SNS topic: {e}")
            return None
        return topic['TopicArn']


    def check_subscription(topic_arn, email_address, sns_client):
        """Return True when ``email_address`` has an email subscription on the topic.

        All result pages are scanned so subscriptions beyond the first page are
        not missed. Returns False on API errors (after printing them).
        """
        try:
            paginator = sns_client.get_paginator('list_subscriptions_by_topic')
            for page in paginator.paginate(TopicArn=topic_arn):
                for subscription in page['Subscriptions']:
                    if (subscription['Protocol'] == 'email'
                            and subscription['Endpoint'] == email_address):
                        print(f"{email_address} is already subscribed to the topic.")
                        return True
        except ClientError as e:
            print(f"Error checking subscriptions for {email_address}: {e}")
        return False


    def subscribe_to_topic(topic_arn, email_address, sns_client):
        """Subscribe an email address to an SNS topic if not already subscribed."""
        if check_subscription(topic_arn, email_address, sns_client):
            return
        try:
            sns_client.subscribe(
                TopicArn=topic_arn,
                Protocol='email',
                Endpoint=email_address,
            )
            print(f"Subscription request sent to {email_address}. They need to confirm the subscription.")
        except ClientError as e:
            print(f"Failed to subscribe {email_address}: {e}")


    def publish_to_topic(topic_arn, failed_instances, sns_client):
        """Publish one message to the topic listing every failing instance.

        Each entry of ``failed_instances`` must provide ``InstanceId``,
        ``Region`` and ``AlertTime``.
        """
        instance_lines = "\n".join(
            f"Instance ID: {inst['InstanceId']} in {inst['Region']} at {inst['AlertTime']}"
            for inst in failed_instances
        )
        message = ("EC2 Instance Health Check Failure\n\n"
                   "The following instances are failing health checks:\n"
                   + instance_lines)
        subject = "Alert: EC2 Instance Health Check Failure"

        try:
            response = sns_client.publish(
                TopicArn=topic_arn,
                Message=message,
                Subject=subject,
            )
            print(f"Message published to SNS topic: {response['MessageId']}")
        except ClientError as e:
            print(f"An error occurred: {e}")


    def alert_failed_instances(failed_instances, access_key, secret_key, recipient_email,
                               sns_topic_name="ec2-health-alerts", aws_region="us-east-1"):
        """Process failed instances and alert via SNS in a primary region.

        :param failed_instances: List of dictionaries containing instance details.
        :param access_key: AWS access key ID.
        :param secret_key: AWS secret access key.
        :param recipient_email: Email address to receive alerts.
        :param sns_topic_name: (Optional) SNS topic name for publishing alerts.
        :param aws_region: (Optional) AWS region for the SNS client and topic.
        """
        # Nothing to report: do not create topics, subscribe, or send an
        # alert whose instance list is empty.
        if not failed_instances:
            print("No failing instances to report; skipping SNS alert.")
            return

        sns_client = get_sns_client(access_key, secret_key, aws_region)
        topic_arn = setup_sns_topic(sns_topic_name, sns_client)
        if topic_arn:
            subscribe_to_topic(topic_arn, recipient_email, sns_client)
            publish_to_topic(topic_arn, failed_instances, sns_client)


    # Example of the expected input shape (normally supplied by the upstream task):
    # failed_instances = [
    #     {'InstanceId': 'i-0123456789abcdef0', 'Region': 'us-west-2', 'AlertTime': '2024-05-17T12:33:21.757843'},
    #     {'InstanceId': 'i-023456789abcdef01', 'Region': 'us-east-1', 'AlertTime': '2024-05-17T12:33:23.036687'},
    # ]
    # recipient_email = 'xyz@example.com'
    sns_topic_name = 'ec2-health-alerts'

    # Pass the configured topic name explicitly; otherwise the variable above is
    # ignored and the function silently falls back to its default.
    alert_failed_instances(failed_instances, access_key, secret_key, recipient_email,
                           sns_topic_name=sns_topic_name)
    copied
    1