A4lGERku9Zw0d4netiBhCost Tracking with CUR test
Cost Tracking with CUR test
There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.
inputs
outputs
- 1ugY8OF6Hleyi7YVYTQdtFetching AWS Cost and Usage Report from S3 test
1
Fetching AWS Cost and Usage Report from S3 test
There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.inputsoutputsimport boto3 import gzip import pandas as pd from io import StringIO from datetime import datetime, timedelta from botocore.exceptions import ParamValidationError #BUCKET_NAME = 'dagknowscostreport' #BASE_PATH = 'costreport/dagknowscostreport/' # Is the path from bucket root directory to the 'dated folders' which contain the CUR Reports #FILENAME = 'dagknowscostreport-00001.csv.gz' #last_n_days = 100 # Retrieve AWS credentials from the vault creds = _get_creds(cred_label)['creds'] access_key = creds['username'] secret_key = creds['password'] s3_client = boto3.client('s3',aws_access_key_id=access_key,aws_secret_access_key=secret_key) def list_s3_keys(bucket, prefix): s3 = boto3.client('s3',aws_access_key_id=access_key,aws_secret_access_key=secret_key) keys = [] kwargs = {'Bucket': bucket, 'Prefix': prefix} while True: response = s3.list_objects_v2(**kwargs) for obj in response.get('Contents', []): keys.append(obj['Key']) try: kwargs['ContinuationToken'] = response['NextContinuationToken'] except KeyError: break return keys def fetch_data_from_s3(file_key): try: s3 = boto3.client('s3',aws_access_key_id=access_key,aws_secret_access_key=secret_key) response = s3.get_object(Bucket=BUCKET_NAME, Key=file_key) gz_content = response['Body'].read() csv_content = gzip.decompress(gz_content).decode('utf-8') return pd.read_csv(StringIO(csv_content), low_memory=False) except Exception as e: print(f"Error fetching data from S3 for key {file_key}: {e}") return None # Function to get the end of the previous month def get_end_of_last_month(date): return (date.replace(day=1) - timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0) def check_column_existence(df, column_name): if column_name not in df.columns: print(f"Warning: Column '{column_name}' not found in the DataFrame!") return False print(f"Column '{column_name}' exists in the DataFrame.") return True def list_folders(prefix): paginator = s3_client.get_paginator('list_objects_v2') folders = [] for page in paginator.paginate(Bucket=BUCKET_NAME, Prefix=prefix, Delimiter='/'): folders.extend([content['Prefix'] for content in page.get('CommonPrefixes', [])]) return folders def list_csv_gz_files(folder): response = s3_client.list_objects_v2(Bucket=BUCKET_NAME, Prefix=folder) files = [{'key': obj['Key'], 'last_modified': obj['LastModified']} for obj in response.get('Contents', []) if obj['Key'].endswith('.csv.gz')] return files ''' def print_last_file_info(files): if not files: print("No CSV GZ files found in the folder.") return # Sort files by last modified time last_file = sorted(files, key=lambda x: x['last_modified'], reverse=True)[0] print(f"Last file: {last_file['key']}, Last modified: {last_file['last_modified']}") ''' def process_data(bucket_name, base_path, last_n_days): try: # Setup for fetching data end_date = datetime.utcnow() - timedelta(days=1) start_date = end_date - timedelta(days=last_n_days) # Modified to use the list_folders function for getting month range folders month_ranges = list_folders(base_path) all_keys = [] for month_range in month_ranges: # Logic to fetch the last .csv.gz file from the last folder of the month keys_for_month = list_csv_gz_files(month_range) if keys_for_month: # Sort files by last modified time and get the last file last_file = sorted(keys_for_month, key=lambda x: x['last_modified'], reverse=True)[0] all_keys.append(last_file['key']) print(f"Last file for {month_range}: {last_file['key']}, Last modified: {last_file['last_modified']}") else: print(f"No CSV GZ files found in the folder {month_range}.") # Fetch and process data from identified keys dfs = [fetch_data_from_s3(key) for key in all_keys] df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame() # Further processing on the DataFrame df['lineItem/UsageStartDate'] = pd.to_datetime(df['lineItem/UsageStartDate']) df['day'] = df['lineItem/UsageStartDate'].dt.date # Exclude the latest date from the DataFrame latest_date = df['day'].max() df = df[df['day'] < latest_date] # Check required columns required_columns = [ 'lineItem/ProductCode', 'lineItem/UnblendedCost', 'lineItem/BlendedCost', 'lineItem/UsageStartDate', 'lineItem/UsageAccountId', 'lineItem/NormalizedUsageAmount', 'product/productFamily', 'product/instanceType' ] # Check if all required columns exist if not all([check_column_existence(df, col) for col in required_columns]): print("One or more required columns are missing.") return pd.DataFrame(), False # Returning an empty DataFrame and False return df, len(dfs) > 0 except Exception as e: print(f"ERROR: {e}") return pd.DataFrame(), False folders = list_folders(BASE_PATH) for folder in folders: print(f"Processing folder: {folder}") files = list_csv_gz_files(folder) #print_last_file_info(files) df, data_fetched = process_data(BUCKET_NAME, BASE_PATH, last_n_days) if data_fetched: print("Proceeding with further operations") else: print("No data fetched. Exiting operation.") context.proceed = False print("Script Execution End")copied1 - 2qhWSd60WuqvTl2PwfEhwtest dataframe operations for cur
2
test dataframe operations for cur
There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.inputsoutputs2- 2.1fnVe3YhjJYlysqrihdF1AWS RDS Cost Analysis based on Instance Type
2.1
AWS RDS Cost Analysis based on Instance Type
There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.This task involves comparing daily cost of all RDS Instances in an AWS Account, helping organizations choose the most economical options for their database needs while optimizing their AWS budget
inputsoutputsimport pandas as pd from datetime import datetime, timedelta, timezone # Assuming df is your DataFrame and last_n_days is defined # Example: last_n_days = 30 if df is not None: #print("DataFrame Columns:", list(df.columns)) # Convert 'lineItem/UsageStartDate' to datetime and ensure it's in UTC df['lineItem/UsageStartDate'] = pd.to_datetime(df['lineItem/UsageStartDate'], utc=True) # Filter out negative costs df = df[df['lineItem/UnblendedCost'] >= 0] # Adjust the cutoff date to the start of the day in UTC cutoff_date = (datetime.utcnow() - timedelta(days=last_n_days)).replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=timezone.utc) rds_df = df[(df['lineItem/UsageStartDate'] > cutoff_date) & (df['product/ProductName'] == 'Amazon Relational Database Service') & (df['product/instanceType'].notna())] '''# First full day after cutoff first_full_day_after_cutoff = cutoff_date + timedelta(days=1) rds_df = rds_df[rds_df['lineItem/UsageStartDate'] >= first_full_day_after_cutoff]''' if not rds_df.empty: # Group by 'lineItem/UsageStartDate' and 'product/instanceType' for RDS rds_result = rds_df.groupby([rds_df['lineItem/UsageStartDate'].dt.date, 'product/instanceType']).agg( usage_hours=('lineItem/UsageAmount', 'sum'), usage_cost=('lineItem/UnblendedCost', 'sum') ).reset_index() #print("Number of rows in result DataFrame:", len(rds_result)) # Plotting the RDS cost data context.plot.xlabel = 'Date' context.plot.ylabel = 'RDS Usage Cost($)' context.plot.title = f'RDS Cost Usage (Last {last_n_days} Days)' for instance_type in rds_result['product/instanceType'].unique(): instance_data = rds_result[rds_result['product/instanceType'] == instance_type] x = instance_data['lineItem/UsageStartDate'].tolist() y = instance_data['usage_cost'].tolist() #print(f"Instance Type: {instance_type}") #print(f"Sample Dates (x): {x[:5]}") #print(f"Sample Costs (y): {y[:5]}") context.plot.add_trace(name=f"RDS- {instance_type}", xpts=x, ypts=y, tracetype="line") else: print("No data available for RDS in the specified time frame.") else: print("DataFrame is empty. Exiting.") #context.proceed=Falsecopied2.1 - 2.2knsCAmXmZZ4hdV9sQOoWGet unique values column wise in a pandas data frame
2.2
Get unique values column wise in a pandas data frame
There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.inputsoutputsimport pandas as pd from datetime import datetime, timedelta # Define last_n_days parameter #last_n_days = 30 # You can change this value as needed if df is not None: print("Analyzing and visualizing daily AWS costs...") # Filter out negative values filter_condition = (df['lineItem/UnblendedCost'] >= 0) #| (df['lineItem/UnblendedCost'] <= -0.001) df = df[filter_condition] # Convert 'lineItem/UsageStartDate' to datetime df['lineItem/UsageStartDate'] = pd.to_datetime(df['lineItem/UsageStartDate']) # Extract the day of the month and create a new column 'day' df['day'] = df['lineItem/UsageStartDate'].dt.date # Get the maximum date in the dataset max_date = df['day'].max() # Calculate the start date based on the max date and last_n_days start_date = max_date - timedelta(days=last_n_days) # Filter the DataFrame to only include dates greater than or equal to the start date filtered_df = df[df['day'] >= start_date] # Group by day and sum 'lineItem/UnblendedCost' for each group daily_costs = filtered_df.groupby('day')['lineItem/UnblendedCost'].sum() # Extract x and y values x = daily_costs.index.tolist() # This gets the dates y = daily_costs.values.tolist() # This gets the corresponding costs #print(df.head()) # Print the first few rows of the original DataFrame #print(daily_costs) # Print the daily costs after grouping and summing #print(y) # Print the y values extracted from daily_costs # Set the properties for your plot context.plot.xlabel = 'Date' context.plot.ylabel = 'Cost ($)' context.plot.title = f'Daily AWS Costs (Last {last_n_days} Days)' context.plot.add_trace(name=f'Daily AWS Costs (Last {last_n_days} Days)', xpts=x, ypts=y, tracetype="lines") else: print("Failed to fetch data. Exiting.") context.proceed=False ''' columns= ['lineItem/ProductCode', 'lineItem/Description', 'lineItem/UsageType', 'lineItem/Operation', 'product/instanceFamily', 'product/instanceType', 'product/instanceTypeFamily'] for column_to_investigate in columns: # Checking if the column exists in the DataFrame if column_to_investigate in df.columns: # Print all unique values in the column unique_values = df[column_to_investigate].unique() print(f"Unique values in '{column_to_investigate}':\n{unique_values}") else: print(f"The column '{column_to_investigate}' does not exist in the DataFrame.") '''copied2.2 - 2.3T0jor5KG4bxHfxkBMVyprds cost analysis by instance id
2.3
rds cost analysis by instance id
There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.inputsoutputsimport pandas as pd from datetime import datetime, timedelta, timezone # Assuming df is your DataFrame and last_n_days is defined # Example: last_n_days = 30 if df is not None: print("DataFrame Columns:", list(df.columns)) # Convert 'lineItem/UsageStartDate' to datetime df['lineItem/UsageStartDate'] = pd.to_datetime(df['lineItem/UsageStartDate']) # Filter out negative costs df = df[df['lineItem/UnblendedCost'] >= 0] # Filter rows for the last N days for RDS cutoff_date = datetime.utcnow().replace(tzinfo=timezone.utc) - timedelta(days=last_n_days) rds_df = df[(df['lineItem/UsageStartDate'] > cutoff_date) & (df['lineItem/ProductCode'] == 'AmazonRDS') & (df['lineItem/ResourceId'].notna())] if not rds_df.empty: # Group by 'lineItem/UsageStartDate' and 'lineItem/ResourceId' rds_result = rds_df.groupby([rds_df['lineItem/UsageStartDate'].dt.date, 'lineItem/ResourceId']).agg( usage_hours=('lineItem/UsageAmount', 'sum'), usage_cost=('lineItem/UnblendedCost', 'sum') ).reset_index() print("Number of rows in result DataFrame:", len(rds_result)) # Plotting the RDS cost data by instance ID context.plot.xlabel = 'Date' context.plot.ylabel = 'RDS Usage Cost($)' context.plot.title = f'RDS Cost by Instance ID (Last {last_n_days} Days)' for instance_id in rds_result['lineItem/ResourceId'].unique(): instance_data = rds_result[rds_result['lineItem/ResourceId'] == instance_id] x = instance_data['lineItem/UsageStartDate'].tolist() y = instance_data['usage_cost'].tolist() context.plot.add_trace(name=f"RDS Instance - {instance_id}", xpts=x, ypts=y, tracetype="line") else: print("No RDS data available for the specified time frame.") else: print("DataFrame is empty. Exiting.")copied2.3 - 2.4CQ4CIPSov8TtTznjVvJ1AWS RDS Cost holistic
2.4
There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.inputsoutputsimport pandas as pd import numpy as np from datetime import datetime, timezone # Assuming df is your DataFrame with AWS Cost and Usage Report data if df is not None: # Convert date columns and ensure numeric columns are correct df['lineItem/UsageStartDate'] = pd.to_datetime(df['lineItem/UsageStartDate']) numeric_cols = ['lineItem/UsageAmount', 'lineItem/UnblendedCost', 'reservation/EffectiveCost', 'reservation/UnusedAmortizedUpfrontFeeForBillingPeriod', 'reservation/UnusedRecurringFee', 'reservation/AmortizedUpfrontFeeForBillingPeriod'] df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce') df.fillna(0, inplace=True) # Filter for RDS and specific line item types rds_df = df[(df['product/ProductName'] == 'Amazon Relational Database Service') & (df['lineItem/LineItemType'].isin(['DiscountedUsage', 'Usage', 'Fee', 'RIFee']))] # Calculations based on CASE WHEN conditions conditions = [ (rds_df['lineItem/LineItemType'] == 'DiscountedUsage'), (rds_df['lineItem/LineItemType'] == 'Usage'), (rds_df['lineItem/LineItemType'] == 'RIFee') ] choices = [ rds_df['lineItem/UsageAmount'], rds_df['lineItem/UsageAmount'], 0 ] rds_df['sum_lineItem_UsageAmount'] = np.select(conditions, choices, default=0) rds_df['sum_amortized_cost'] = np.select(conditions, [rds_df['reservation/EffectiveCost'], rds_df['reservation/UnusedAmortizedUpfrontFeeForBillingPeriod'] + rds_df['reservation/UnusedRecurringFee'], rds_df['lineItem/UnblendedCost']], default=rds_df['lineItem/UnblendedCost']) rds_df['sum_ri_trueup'] = np.where(rds_df['lineItem/LineItemType'] == 'RIFee', -rds_df['reservation/AmortizedUpfrontFeeForBillingPeriod'], 0) # Grouping and aggregation with 'pricing/term' grouped_result = rds_df.groupby([ 'bill/PayerAccountId', 'lineItem/UsageAccountId', rds_df['lineItem/UsageStartDate'].dt.date, 'product/instanceType', 'lineItem/Operation', 'lineItem/UsageType', 'lineItem/LineItemType', 'pricing/term', 'product/productFamily', rds_df['lineItem/ResourceId'].str.split(':', expand=True)[6], 'product/databaseEngine' ]).agg( sum_lineItem_UnblendedCost=('lineItem/UnblendedCost', 'sum'), sum_lineItem_UsageAmount=('sum_lineItem_UsageAmount', 'sum'), sum_amortized_cost=('sum_amortized_cost', 'sum'), sum_ri_trueup=('sum_ri_trueup', 'sum') ).reset_index() # Visualization with context.plot context.plot.xlabel = 'Date' context.plot.ylabel = 'Cost' context.plot.title = 'AWS RDS Cost Analysis' for _, row in grouped_result.iterrows(): context.plot.add_trace(name=f"RDS - {row['product/instanceType']}", xpts=[row['lineItem/UsageStartDate']], ypts=[row['sum_lineItem_UnblendedCost']], tracetype="line") print("Data analysis and plotting complete.") else: print("DataFrame is empty. Exiting.")copied2.4 - 2.5xp9aAsSwUKwYtnJLMAhArds costs by instance id
2.5
There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.inputsoutputsimport pandas as pd from datetime import datetime, timedelta, timezone # Assuming df is your DataFrame and last_n_days is defined # Example: last_n_days = 30 if df is not None: print("DataFrame Columns:", list(df.columns)) # Convert 'lineItem/UsageStartDate' to datetime df['lineItem/UsageStartDate'] = pd.to_datetime(df['lineItem/UsageStartDate']) # Filter out negative costs df = df[df['lineItem/UnblendedCost'] >= 0] # Filter rows for the last N days for RDS cutoff_date = datetime.utcnow().replace(tzinfo=timezone.utc) - timedelta(days=last_n_days) rds_df = df[(df['lineItem/UsageStartDate'] > cutoff_date) & (df['lineItem/ProductCode'] == 'AmazonRDS') & (df['lineItem/ResourceId'].notna())] if not rds_df.empty: # Group by 'lineItem/UsageAccountId', 'lineItem/ResourceId', and 'lineItem/UsageStartDate' grouped_rds = rds_df.groupby(['lineItem/UsageAccountId', 'lineItem/ResourceId', rds_df['lineItem/UsageStartDate'].dt.date]).agg( total_usage=('lineItem/UsageAmount', 'sum'), total_cost=('lineItem/UnblendedCost', 'sum') ).reset_index() print("Number of rows in result DataFrame:", len(grouped_rds)) # Plotting the RDS cost data for each instance ID context.plot.xlabel = 'Date' context.plot.ylabel = 'RDS Instance Usage Cost($)' context.plot.title = f'RDS Instance Cost Usage (Last {last_n_days} Days)' for instance_id in grouped_rds['lineItem/ResourceId'].unique(): instance_data = grouped_rds[grouped_rds['lineItem/ResourceId'] == instance_id] x = instance_data['lineItem/UsageStartDate'].tolist() y = instance_data['total_cost'].tolist() print(f"Instance ID: {instance_id}") print(f"Dates (x values): {x}") print(f"Costs (y values): {y}") context.plot.add_trace(name=f"RDS Instance - {instance_id}", xpts=x, ypts=y, tracetype="line") else: print("No RDS data available in the specified time frame.") else: print("DataFrame is empty. Exiting.")copied2.5