Sign in

Stopping Underutilized AWS EC2 Instances

There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.

In an AWS environment, EC2 instances incur charges based on their uptime. However, not all instances are actively utilized, leading to unnecessary expenses. Underutilized instances may have low CPU usage, minimal network activity, or other metrics indicating limited activity. Identifying and stopping such instances can result in significant cost savings. Tools like AWS Cost Explorer and third-party solutions can help identify these instances based on CloudWatch metrics. This runbook automates the process of monitoring and taking action on underutilized instances based on low CPU usage, ensuring an optimized and cost-effective cloud environment. It's crucial, though, to ensure that stopping these instances won't disrupt essential services or applications.

# Runbook input parameters, hardcoded here for a one-time execution result.
# Each assignment sits on its own line: when they share a line, everything
# after the first '#' is a comment and the later names are never defined.

# Average CPUUtilization (percent) below which a running instance is
# reported as idle by the "Identify Idle AWS EC2 Instances" task.
CPU_THRESHOLD = 20  # Hardcoded for one time result

# Size, in hours, of the CloudWatch window inspected by the idle check.
LOOKBACK_PERIOD_HOURS = 2  # Hardcoded for one time result

# Region to scan; None makes the listing task iterate over every region.
region_name = None
copied
  1. 1

    Get all AWS EC2 instances

    There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.

    Amazon Elastic Compute Cloud (EC2) is a service offered by Amazon Web Services (AWS) that provides resizable compute capacity in the cloud. Through Boto3's EC2 client, the describe_instances() method provides detailed information about each instance, including its ID, type, launch time, and current state. This capability assists users in effectively monitoring and managing their cloud resources.

    import boto3
    from botocore.exceptions import NoCredentialsError, PartialCredentialsError, BotoCoreError, ClientError

    # Credentials are supplied by the hosting platform; the 'username' and
    # 'password' fields are used as the AWS access key id and secret key.
    creds = _get_creds(cred_label)['creds']
    access_key = creds['username']
    secret_key = creds['password']


    def list_all_regions():
        """Return the region names reported by EC2 describe_regions."""
        ec2 = boto3.client(
            'ec2',
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name='us-east-1',
        )
        region_names = []
        for entry in ec2.describe_regions()['Regions']:
            region_names.append(entry['RegionName'])
        return region_names


    def list_ec2_instances(region=None):
        """Collect basic details of EC2 instances.

        Args:
            region: A single region name to scan. When falsy, every region
                returned by list_all_regions() is scanned.

        Returns:
            A list of dicts with the keys 'InstanceId', 'InstanceType',
            'LaunchTime', 'State' and 'Region'. A region that fails is
            reported with print() and does not abort the other regions.
        """
        if region:
            regions = [region]
        else:
            regions = list_all_regions()

        instance_details = []

        for region in regions:
            # Build the regional client; any failure skips just this region.
            try:
                ec2_client = boto3.client(
                    'ec2',
                    aws_access_key_id=access_key,
                    aws_secret_access_key=secret_key,
                    region_name=region,
                )
            except (NoCredentialsError, PartialCredentialsError):
                print(f"Failed for {region}: No AWS credentials found or incomplete credentials provided.")
                continue
            except BotoCoreError as e:
                print(f"Failed for {region}: Error initializing the EC2 client due to BotoCore Error: {e}")
                continue
            except Exception as e:
                print(f"Failed for {region}: Unexpected error initializing the EC2 client: {e}")
                continue

            # Walk every page of describe_instances for this region.
            try:
                pages = ec2_client.get_paginator('describe_instances').paginate()
                for page in pages:
                    for reservation in page['Reservations']:
                        for inst in reservation['Instances']:
                            instance_details.append({
                                'InstanceId': inst['InstanceId'],
                                'InstanceType': inst['InstanceType'],
                                'LaunchTime': inst['LaunchTime'],
                                'State': inst['State']['Name'],
                                'Region': region,
                            })
            except ClientError as e:
                print(f"Failed for {region}: AWS Client Error while fetching EC2 instance details: {e}")
            except Exception as e:
                print(f"Failed for {region}: Unexpected error while fetching EC2 instance details: {e}")

        return instance_details


    def display_instance_details(data):
        """Render the instance dicts in `data` as a platform table.

        `data` is sorted in place by 'LaunchTime', newest first, before the
        rows are written.
        """
        headers = ["Instance ID", "Instance Type", "Launch Time", "State", "Region"]

        table = context.newtable()
        table.title = "EC2 Instance Details"
        table.num_cols = len(headers)
        table.num_rows = 1  # row 0 holds the headers
        table.has_header_row = True

        for col_idx, title in enumerate(headers):
            table.setval(0, col_idx, title)

        data.sort(key=lambda item: item["LaunchTime"], reverse=True)

        row_idx = 0
        for record in data:
            row_idx += 1
            table.num_rows += 1
            cells = (
                record["InstanceId"],
                record["InstanceType"],
                record["LaunchTime"].strftime('%Y-%m-%d %H:%M:%S'),
                record["State"],
                record["Region"],
            )
            for col_idx, cell in enumerate(cells):
                table.setval(row_idx, col_idx, cell)


    # Replace None with a region string such as 'us-east-1' to scan only that
    # region. Hardcoded region_name for a one-time execution result.
    region_name = None
    instances_list = list_ec2_instances(region_name)

    if not instances_list:
        print("No instances found or an error occurred.")
    else:
        display_instance_details(instances_list)
    copied
    1
  2. 2

    Average CPU Utilizations of all AWS EC2 Instances

    There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.
    import boto3
    from datetime import datetime, timedelta, timezone
    from botocore.exceptions import NoCredentialsError, PartialCredentialsError, BotoCoreError, ClientError, EndpointConnectionError, DataNotFoundError

    # AWS credentials: the platform credential's 'username' and 'password'
    # fields are used as the access key id and secret access key.
    creds = _get_creds(cred_label)['creds']
    access_key = creds['username']
    secret_key = creds['password']

    # Fall back to an empty list when no earlier task defined instances_list.
    if locals().get('instances_list') is None:
        instances_list = []

    # CloudWatch clients keyed by region, so instances that share a region
    # reuse one client instead of building a new one per instance.
    _cloudwatch_clients = {}


    def _get_cloudwatch_client(region):
        """Return the cached CloudWatch client for `region`, creating it on first use."""
        client = _cloudwatch_clients.get(region)
        if client is None:
            client = boto3.client(
                'cloudwatch',
                aws_access_key_id=access_key,
                aws_secret_access_key=secret_key,
                region_name=region,
            )
            _cloudwatch_clients[region] = client
        return client


    def fetch_cpu_utilization(instance_id, region, start_time, end_time):
        """Fetch hourly average CPUUtilization for one instance.

        Args:
            instance_id: EC2 instance id used as the 'InstanceId' dimension.
            region: Region of the CloudWatch endpoint to query.
            start_time: Start of the query window (datetime).
            end_time: End of the query window (datetime).

        Returns:
            A (timestamps, values) tuple taken from the first metric result,
            oldest datapoint first. On any error the problem is printed and
            ([], []) is returned.
        """
        try:
            cloudwatch = _get_cloudwatch_client(region)
            response = cloudwatch.get_metric_data(
                MetricDataQueries=[
                    {
                        'Id': 'cpuUtilization',
                        'MetricStat': {
                            'Metric': {
                                'Namespace': 'AWS/EC2',
                                'MetricName': 'CPUUtilization',
                                'Dimensions': [{'Name': 'InstanceId', 'Value': instance_id}],
                            },
                            'Period': 3600,  # one hour
                            'Stat': 'Average',
                        },
                        'ReturnData': True,
                    },
                ],
                StartTime=start_time,
                EndTime=end_time,
                # The API default is newest-first; a line trace should be
                # drawn in chronological order.
                ScanBy='TimestampAscending',
            )
            result = response['MetricDataResults'][0]
            return result['Timestamps'], result['Values']
        except Exception as e:
            print(f"Error getting CPU utilization for instance {instance_id}: {e}")
            return [], []


    def plot_cpu_utilization(instances_list, lookback_days=7):
        """Add one line trace per running instance to the platform plot.

        Args:
            instances_list: Dicts with at least 'InstanceId', 'Region' and
                'State'. Instances whose state is not 'running' are skipped.
            lookback_days: Length of the window, in days, ending now (UTC).
        """
        # Timezone-aware UTC "now"; datetime.utcnow() is deprecated.
        end_time = datetime.now(timezone.utc)
        start_time = end_time - timedelta(days=lookback_days)

        for instance in instances_list:
            if instance['State'] != 'running':
                continue

            timestamps, cpu_values = fetch_cpu_utilization(
                instance['InstanceId'], instance['Region'], start_time, end_time
            )

            # Only draw a trace when CloudWatch returned datapoints.
            if timestamps:
                context.plot.add_trace(
                    name=f"Instance {instance['InstanceId']}",
                    xpts=timestamps,  # x-axis points
                    ypts=cpu_values,  # y-axis points
                    tracetype="line",
                )

        # Set plot properties
        context.plot.xlabel = 'Date'
        context.plot.ylabel = 'Average CPU Utilization (%)'
        context.plot.title = f'CPU Utilization per EC2 Instance (Last {lookback_days} Days)'


    # Execute the plotting function
    plot_cpu_utilization(instances_list)
    copied
    2
  3. 3

    Identify Idle AWS EC2 Instances

    There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.

    AWS EC2 instances that are running but not actively used represent unnecessary costs. An "idle" EC2 instance typically exhibits very low metrics on parameters such as CPU utilization, network input/output, and disk reads/writes. By leveraging AWS CloudWatch, users can monitor these metrics and identify instances that remain underutilized based on low CPU usage over extended periods. Once identified, these instances can either be stopped or terminated, leading to more efficient resource use and cost savings. It's important to analyze and verify the activity of these instances before taking action to ensure no critical processes are inadvertently affected.

    import boto3
    from datetime import datetime, timedelta, timezone
    from botocore.exceptions import NoCredentialsError, PartialCredentialsError, BotoCoreError, ClientError, EndpointConnectionError, DataNotFoundError

    # CPU_THRESHOLD and LOOKBACK_PERIOD_HOURS are runbook inputs defined
    # outside this task, for example:
    # CPU_THRESHOLD = 5.0
    # LOOKBACK_PERIOD_HOURS = 1

    # AWS credentials: the platform credential's 'username' and 'password'
    # fields are used as the access key id and secret access key.
    creds = _get_creds(cred_label)['creds']
    access_key = creds['username']
    secret_key = creds['password']

    # Fall back to an empty list when no earlier task defined instances_list.
    if locals().get('instances_list') is None:
        instances_list = []

    # CloudWatch clients keyed by region, so instances that share a region
    # reuse one client instead of building a new one per instance.
    _cloudwatch_clients = {}


    def _get_cloudwatch_client(region):
        """Return the cached CloudWatch client for `region`, creating it on first use."""
        client = _cloudwatch_clients.get(region)
        if client is None:
            client = boto3.client(
                'cloudwatch',
                aws_access_key_id=access_key,
                aws_secret_access_key=secret_key,
                region_name=region,
            )
            _cloudwatch_clients[region] = client
        return client


    def get_idle_instances(instances_list):
        """Return the running instances whose average CPU is below CPU_THRESHOLD.

        The hourly average CPUUtilization of each running instance is read
        for the last LOOKBACK_PERIOD_HOURS hours. An instance is reported when
        the mean of those datapoints is below CPU_THRESHOLD.

        Instances with no datapoints in the window are skipped. Missing
        metrics are not evidence of idleness, and everything returned here is
        handed to the stop task.

        Args:
            instances_list: Dicts with at least 'InstanceId', 'Region' and
                'State'. Instances whose state is not 'running' are ignored.

        Returns:
            Copies of the matching instance dicts, each with an extra
            'IdleHours' key: the number of hourly datapoints below
            CPU_THRESHOLD. An instance that raises an error is reported with
            print() and left out.
        """
        idle_instances = []
        # Timezone-aware UTC "now"; datetime.utcnow() is deprecated.
        end_time = datetime.now(timezone.utc)
        start_time = end_time - timedelta(hours=LOOKBACK_PERIOD_HOURS)

        for instance in instances_list:
            if instance['State'] != 'running':
                continue

            instance_id = instance['InstanceId']
            region = instance['Region']

            try:
                cloudwatch = _get_cloudwatch_client(region)
                cpu_stats = cloudwatch.get_metric_data(
                    MetricDataQueries=[
                        {
                            'Id': 'cpuUtil',
                            'MetricStat': {
                                'Metric': {
                                    'Namespace': 'AWS/EC2',
                                    'MetricName': 'CPUUtilization',
                                    'Dimensions': [{'Name': 'InstanceId', 'Value': instance_id}],
                                },
                                'Period': 3600,  # one hour periods
                                'Stat': 'Average',
                            },
                            'ReturnData': True,
                        }
                    ],
                    StartTime=start_time,
                    EndTime=end_time,
                )
                cpu_values = cpu_stats['MetricDataResults'][0]['Values']

                # No datapoints means "unknown", not "0% CPU": do not flag it.
                if not cpu_values:
                    print(f"No CPU utilization data for instance {instance_id} in region {region}; skipping.")
                    continue

                avg_cpu_utilization = sum(cpu_values) / len(cpu_values)

                if avg_cpu_utilization < CPU_THRESHOLD:
                    # Each datapoint covers one hour, so this is a count of hours.
                    idle_hours = sum(1 for val in cpu_values if val < CPU_THRESHOLD)
                    instance_info = instance.copy()
                    instance_info['IdleHours'] = idle_hours
                    idle_instances.append(instance_info)

            except Exception as e:
                print(f"Error processing instance {instance_id} in region {region}: {e}")

        return idle_instances


    def display_instance_details(data):
        """Render the idle-instance dicts in `data` as a platform table.

        `data` is sorted in place by 'LaunchTime', newest first, before the
        rows are written.
        """
        table = context.newtable()
        table.title = "Idle EC2 Instances"
        table.num_cols = 6
        table.num_rows = 1  # row 0 holds the headers
        table.has_header_row = True

        headers = ["Instance ID", "Instance Type", "Launch Time", "State", "Region", "Idle Hours"]
        for col_num, header in enumerate(headers):
            table.setval(0, col_num, header)

        data.sort(key=lambda x: x["LaunchTime"], reverse=True)

        for row_num, instance in enumerate(data, start=1):
            table.num_rows += 1
            values = [
                instance["InstanceId"],
                instance["InstanceType"],
                instance["LaunchTime"].strftime('%Y-%m-%d %H:%M:%S'),
                instance["State"],
                instance["Region"],
                str(instance["IdleHours"]),  # table cells are written as strings
            ]
            for col_num, value in enumerate(values):
                table.setval(row_num, col_num, value)


    # Main execution: instances_list comes from the parent task.
    idle_instances_list = get_idle_instances(instances_list)

    if idle_instances_list:
        display_instance_details(idle_instances_list)
    else:
        print("No idle instances found.")

    # Only 'InstanceId' and 'Region' are passed down to the stop task.
    filtered_instances = [
        {'InstanceId': instance['InstanceId'], 'Region': instance['Region']}
        for instance in idle_instances_list
    ]

    # NOTE(review): presumably stops the platform from running the child
    # "stop" task automatically - confirm against the platform documentation.
    context.skip_sub_tasks = True
    copied
    3
    1. 3.1

      Stop an AWS EC2 Instance

      There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.

      In AWS, an EC2 instance can be in various states, including running, stopped, or terminated. Stopping an EC2 instance essentially means shutting it down, similar to turning off a computer. When an instance is stopped, it is not running, and therefore, you are not billed for instance usage. However, you are still billed for any EBS storage associated with the instance. The advantage of stopping, instead of terminating, is that you can start the instance again at any time. This capability is useful for scenarios where you want to temporarily halt operations without losing the instance configuration or data. It's essential to understand that stopping an instance will lead to the loss of the ephemeral storage content (Instance Store), but data on EBS volumes will remain intact.

      import boto3
      from botocore.exceptions import BotoCoreError, ClientError

      # AWS credentials: the platform credential's 'username' and 'password'
      # fields are used as the access key id and secret access key.
      creds = _get_creds(cred_label)['creds']
      access_key = creds['username']
      secret_key = creds['password']

      # Fall back to an empty list when no earlier task defined filtered_instances.
      if locals().get('filtered_instances') is None:
          filtered_instances = []


      def stop_ec2_instances(instances_to_stop):
          """Request a stop for each listed instance and print a summary.

          Each instance's state is read first. Instances already 'stopped' or
          'stopping' are left alone; stop_instances is called for the rest.
          A failure on one instance is recorded and the loop carries on, so
          the summary is always printed.

          Args:
              instances_to_stop: Dicts with the keys 'InstanceId' and 'Region'.
          """
          stopped_instances = []          # stop request accepted
          failed_instances = []           # lookup or stop request failed
          already_stopped_instances = []  # already 'stopped' or 'stopping'
          ec2_clients = {}                # one EC2 client per region, reused

          for instance_info in instances_to_stop:
              instance_id = instance_info['InstanceId']
              region = instance_info['Region']

              try:
                  # Client creation sits inside the try so that a bad region or
                  # credential problem fails this instance only.
                  ec2_client = ec2_clients.get(region)
                  if ec2_client is None:
                      ec2_client = boto3.client(
                          'ec2',
                          aws_access_key_id=access_key,
                          aws_secret_access_key=secret_key,
                          region_name=region,
                      )
                      ec2_clients[region] = ec2_client

                  # Fetch the current state of the instance.
                  response = ec2_client.describe_instances(InstanceIds=[instance_id])
                  reservations = response.get('Reservations') or []
                  instances = reservations[0].get('Instances') if reservations else None
                  if not instances:
                      # An empty result would otherwise raise an uncaught IndexError.
                      failed_instances.append(instance_id)
                      print(f"Error with instance {instance_id} in region {region}: instance not found.")
                      continue

                  instance_state = instances[0]['State']['Name']
                  if instance_state in ["stopped", "stopping"]:
                      already_stopped_instances.append(instance_id)
                      print(f"Instance {instance_id} in region {region} is already in '{instance_state}' state.")
                      continue

                  # Not stopped or stopping yet, so attempt to stop it.
                  ec2_client.stop_instances(InstanceIds=[instance_id])
                  stopped_instances.append(instance_id)
                  print(f"Instance {instance_id} in region {region} has been stopped.")

              except (ClientError, BotoCoreError) as e:
                  # BotoCoreError covers connection and credential failures,
                  # which are not ClientError subclasses.
                  failed_instances.append(instance_id)
                  print(f"Error with instance {instance_id} in region {region}: {e}")

          # Print a summary of the actions.
          print("\nSummary:\n")
          if stopped_instances:
              print(f"Successfully stopped {len(stopped_instances)} instances: {', '.join(stopped_instances)}")
          if already_stopped_instances:
              print(f"{len(already_stopped_instances)} instances were already stopped or stopping: {', '.join(already_stopped_instances)}")
          if failed_instances:
              print(f"Failed to stop {len(failed_instances)} instances: {', '.join(failed_instances)}")


      # To run this task standalone, pass a list shaped like the one below
      # instead of filtered_instances:
      # instances_to_stop = [
      #     {'InstanceId': 'i-01615251421b8b5da', 'Region': 'us-east-1'},
      #     {'InstanceId': 'i-057155192c87ea310', 'Region': 'us-east-1'},
      # ]

      # filtered_instances is passed down from the previous task.
      stop_ec2_instances(filtered_instances)
      copied
      3.1