
Identify slow or high-latency traces for a given service or list of services in Jaeger


Searches Jaeger for traces from the specified services that contain spans with duration greater than min_duration_ms within the last lookback_minutes minutes, sorted by slowest span duration.
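The script wraps Jaeger's HTTP query API (GET /api/traces), which takes a service name plus start/end timestamps in microseconds. Below is a minimal standalone sketch of that underlying request; the endpoint http://localhost:16686, the service name "checkout", and the 30-minute window are assumptions for illustration only.

import requests
import time

# Assumed Jaeger query endpoint; replace with your own (JAEGER_URL_OTEL in the task below)
JAEGER_API = "http://localhost:16686/api"

end_us = int(time.time() * 1_000_000)            # now, in microseconds
start_us = end_us - 30 * 60 * 1_000_000          # example: last 30 minutes

resp = requests.get(
    f"{JAEGER_API}/traces",
    params={"service": "checkout", "start": start_us, "end": end_us, "limit": 20},
    timeout=10,
)
resp.raise_for_status()

# Each returned trace is a dict with a "spans" list; report the slowest span per trace
for trace in resp.json().get("data", []):
    slowest_us = max((s.get("duration", 0) for s in trace.get("spans", [])), default=0)
    print(trace.get("traceID"), f"slowest span: {slowest_us / 1000:.1f} ms")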

Requires inputs: lookback_minutes (int), target_services (array), min_duration_ms (int), limit (int, slowest traces kept per service)
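For illustration, a hypothetical set of input values (service names and thresholds are placeholders; adjust to your environment):

lookback_minutes = 30                              # search the last 30 minutes
target_services = ["checkout", "payment", "cart"]  # hypothetical service names
min_duration_ms = 500                              # flag spans slower than 500 ms
limit = 10                                         # keep the 10 slowest traces per service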

import requests
import json
from datetime import datetime, timedelta

# Get Jaeger URL from environment (expected to end with the query API base path, e.g. http://<host>:16686/api/)
jaeger_url = getEnvVar('JAEGER_URL_OTEL')

# Calculate time range
end_time = datetime.now()
start_time = end_time - timedelta(minutes=lookback_minutes)

# Convert to microseconds (Jaeger expects microseconds)
start_time_us = int(start_time.timestamp() * 1000000)
end_time_us = int(end_time.timestamp() * 1000000)
min_duration_us = min_duration_ms * 1000

print(f"Searching for high latency traces from {start_time} to {end_time}")
print(f"Target services: {len(target_services)} services")
print(f"Minimum span duration: {min_duration_ms}ms")

high_latency_traces = []
trace_count = 0

# Search traces for each target service (limit to first 5 services to reduce output)
services_to_check = target_services[:5] if len(target_services) > 5 else target_services

for service in services_to_check:
    print(f"\nSearching traces for service: {service}")
    traces_url = f"{jaeger_url}traces"
    params = {
        'service': service,
        'start': start_time_us,
        'end': end_time_us,
        'limit': 50  # Reduced limit per service
    }
    try:
        response = requests.get(traces_url, params=params, timeout=6)
        response.raise_for_status()
        traces_data = response.json()

        if 'data' in traces_data:
            traces = traces_data['data']
            print(f"Found {len(traces)} traces for {service}")

            service_traces = []  # Per-service candidates

            # Filter traces that have spans with duration > threshold
            for trace in traces:
                if 'spans' in trace:
                    max_span_duration = 0
                    slow_span_info = None
                    for span in trace['spans']:
                        span_duration = span.get('duration', 0)
                        if span_duration > min_duration_us and span_duration > max_span_duration:
                            max_span_duration = span_duration
                            slow_span_info = {
                                'operation': span.get('operationName', 'unknown'),
                                'duration_ms': round(span_duration / 1000, 2),
                                'span_id': span.get('spanID')
                            }

                    if slow_span_info:
                        # Calculate total trace duration from earliest span start to latest span end
                        trace_duration = 0
                        if trace['spans']:
                            min_start = min(span.get('startTime', 0) for span in trace['spans'])
                            max_end = max(span.get('startTime', 0) + span.get('duration', 0) for span in trace['spans'])
                            trace_duration = max_end - min_start

                        # Find root operation (a span with no references)
                        root_spans = [s for s in trace['spans'] if not s.get('references')]
                        root_operation = root_spans[0].get('operationName', 'unknown') if root_spans else 'unknown'

                        trace_info = {
                            'traceID': trace.get('traceID'),
                            'service': service,
                            'root_operation': root_operation[:50] + '...' if len(root_operation) > 50 else root_operation,
                            'total_duration_ms': round(trace_duration / 1000, 2),
                            'slowest_span': slow_span_info,
                            'spans_count': len(trace['spans']),
                            'start_time': datetime.fromtimestamp(min_start / 1000000).strftime('%H:%M:%S') if trace['spans'] else None
                        }
                        service_traces.append(trace_info)

            # Sort this service's candidates and keep the top `limit` slowest
            service_traces.sort(key=lambda x: x['slowest_span']['duration_ms'], reverse=True)
            high_latency_traces.extend(service_traces[:limit])

    except requests.exceptions.RequestException as e:
        print(f"Error fetching traces for {service}: {e}")
    except Exception as e:
        print(f"Error processing traces for {service}: {e}")

# Sort the merged results globally by slowest span duration
high_latency_traces.sort(key=lambda x: x['slowest_span']['duration_ms'], reverse=True)

# No global slice: results are already capped at `limit` per service
filtered_traces = high_latency_traces
trace_count = len(filtered_traces)

print(f"\nFound {trace_count} traces with spans > {min_duration_ms}ms (top {limit} per service):")
for i, trace in enumerate(filtered_traces, 1):
    print(f"{i}. TraceID: {trace['traceID']} | Service: {trace['service']} | Root Op: {trace['root_operation']}")
    print(f"   Slowest Span: {trace['slowest_span']['operation']} ({trace['slowest_span']['duration_ms']}ms)")
    print(f"   Total Duration: {trace['total_duration_ms']}ms | Spans: {trace['spans_count']} | Time: {trace['start_time']}")
    print()

print(f"high_latency_traces: {json.dumps(filtered_traces, indent=2)}")
print(f"trace_count: {trace_count}")
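Once a slow traceID is reported, the full trace can be pulled back for inspection through Jaeger's single-trace endpoint (GET /api/traces/{traceID}). A minimal follow-up sketch; the base URL and trace ID below are placeholders:

import requests

jaeger_api = "http://localhost:16686/api"   # placeholder; use the same base as JAEGER_URL_OTEL
trace_id = "abc123def456"                   # placeholder; take a traceID from the output above

detail = requests.get(f"{jaeger_api}/traces/{trace_id}", timeout=10)
detail.raise_for_status()
spans = detail.json()["data"][0]["spans"]

# List spans from slowest to fastest to see where the time is spent
for span in sorted(spans, key=lambda s: s.get("duration", 0), reverse=True):
    print(f"{span.get('operationName')}: {span.get('duration', 0) / 1000:.1f} ms")

As a possible refinement, Jaeger's search endpoint also supports a minDuration parameter (e.g. "500ms"), which could move the duration filter server-side instead of filtering spans in the script.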