Sign in
agent:
Auto Exec

Summarize all recent exceptions and errors for a given service (e.g., cart or payment) in Jaeger.

There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.

Searches Jaeger for traces from a single service (e.g., cart or payment) that contain errors (error=true) or HTTP status codes >= 400, then analyzes and counts the exception types.

Requires lookback_minutes (int), target_services (array), and limit (use 5). Default lookback_minutes to 60 if not provided.

# Jaeger error/latency scan (compact output) # ------------------------------------------------------ # WHAT THIS DOES # - Scans recent traces per target service from Jaeger. # - Detects "problems" using span tags: # * otel.status_code == "ERROR" # * error == true # * http.status_code >= MIN_STATUS_CODE (default 400) # * grpc status code != 0 # * any "exception.*" tag # - Buckets similar problems by: (service, root_operation, error_key) # * error_key is one of: exception:<type> | http:<code> | grpc=<code> | otel_status=ERROR | error=true | unknown_error # - Keeps only the BEST example per bucket (most recent; if tie, longest duration) # - Caps how many buckets we keep globally and per service to prevent huge outputs. # # INPUTS expected in your runtime: # JAEGER_URL_OTEL : str (e.g., http://jaeger:16686 or http://jaeger:16686/ui) # lookback_minutes : int (e.g., 60) # target_services : list[str] (e.g., ["checkout","payment","frontend"]) # limit : int (per-service fetch cap, e.g., 100) # # OPTIONAL tuning knobs (set in your runner if you want, else defaults below): # MAX_BUCKETS_GLOBAL : int (default 50) -> total distinct error buckets to output # MAX_BUCKETS_PER_SERVICE : int (default 10) -> max buckets per service # MIN_STATUS_CODE : int (default 400) -> http status threshold to treat as error # MIN_DURATION_MS : int (default 0) -> only keep examples with duration >= this (0 = off) # REQUIRE_ERROR_TAG : bool (default False)-> if True, require explicit error tags (otel/error/grpc/exception) # VERBOSE : bool (default False)-> if True, prints extra progress logs # # OUTPUT SHAPE (printed as JSON): # { # "window_minutes": <int>, # "services": [ ... ], # "caps": { "per_service_buckets": <int>, "global_buckets": <int>, "per_service_fetch_limit": <int> }, # "service_rollup": { # "<service>": { "buckets": <int>, "total_matches": <int> }, # ... 
# }, # "error_traces": [ # { # "signature": { # "service": "<service>", # "root_operation": "<root operation name>", # "key": "<error key>", # e.g., "http:500", "exception:FooError", "grpc=14", "otel_status=ERROR", "error=true" # "signature_key": "<service>|<root_operation>|<key>" # readable bucket key (no hashing) # }, # "count": <int>, # how many traces matched this signature # "example": { # best example for this signature (trimmed) # "trace_id": "<traceID>", # "service": "<service>", # "root_operation": "<root operation name>", # "duration_ms": <float>, # "start_time": "<ISO8601 UTC>", # "indicators": ["http=500", "grpc=14", "otel_status=ERROR", ...] # up to ~6 tags, trimmed # } # }, # ... # ], # "error_traces_count": <int>, # sum of counts across all buckets # "distinct_error_signatures": <int> # number of buckets kept after caps # } # # WHY error_traces IS SMALL: # - We group similar errors and keep ONE best example per group. # - We cap the number of groups globally and per service. # - We trim long indicator lists. 
import json
import requests
from datetime import datetime, timedelta
from collections import defaultdict

error_traces = []

# ----- Inputs from your runner / env -----
# getEnvVar is provided by the hosting runtime (not defined in this file).
JAEGER_URL = getEnvVar("JAEGER_URL_OTEL").rstrip("/")

# Required inputs (provided by your tool form / runner)
lookback_minutes = int(lookback_minutes)  # e.g., 60
target_services = target_services         # e.g., ["checkout","payment","frontend"]
limit = int(limit)                        # e.g., 100

# >>> ADDED: allow script to handle a single service input <<<
# Supports variables single_service or service for one-target runs.
# NameError is expected when the runner did not define the variable.
try:
    single_service = single_service.strip() if single_service else None
except NameError:
    single_service = None
if not single_service:
    try:
        single_service = service.strip() if service else None
    except NameError:
        single_service = None
# If a single service name is provided, override target_services
if single_service:
    target_services = [single_service]
# <<< END ADDITION >>>

# Optional knobs with sensible defaults.
# Narrowed from bare `except:` so SystemExit/KeyboardInterrupt are not
# swallowed; NameError (knob not supplied) and ValueError (bad value)
# both fall back to the default.
try:
    MAX_BUCKETS_GLOBAL = int(MAX_BUCKETS_GLOBAL)
except Exception:
    MAX_BUCKETS_GLOBAL = 50
try:
    MAX_BUCKETS_PER_SERVICE = int(MAX_BUCKETS_PER_SERVICE)
except Exception:
    MAX_BUCKETS_PER_SERVICE = 10
try:
    MIN_STATUS_CODE = int(MIN_STATUS_CODE)
except Exception:
    MIN_STATUS_CODE = 400
try:
    MIN_DURATION_MS = int(MIN_DURATION_MS)
except Exception:
    MIN_DURATION_MS = 0
try:
    REQUIRE_ERROR_TAG = bool(REQUIRE_ERROR_TAG)
except Exception:
    REQUIRE_ERROR_TAG = False
try:
    VERBOSE = bool(VERBOSE)
except Exception:
    VERBOSE = False


# ----- Helpers -----
def api_base(url: str) -> str:
    # Accepts http(s)://host[:port], /ui, /jaeger variants and returns .../api
    if url.endswith("/api"):
        return url
    if url.endswith("/ui"):
        return url + "/api"
    if url.endswith("/jaeger"):
        return url + "/ui/api"
    return url + "/api"


BASE = api_base(JAEGER_URL)


def to_us(dt: datetime) -> int:
    """Convert a datetime to integer microseconds since epoch (Jaeger's unit)."""
    return int(dt.timestamp() * 1_000_000)


end_time = datetime.utcnow()
start_time = end_time - timedelta(minutes=lookback_minutes)
start_us, end_us = to_us(start_time), to_us(end_time)


def safe_int(v):
    """Best-effort int conversion; returns None if v cannot be coerced."""
    try:
        return int(v)
    except Exception:
        try:
            return int(str(v))
        except Exception:
            return None


def find_error_indicators(span):
    """
    Return (has_error, indicators:set[str], http_status:int|None, exception_type:str|None)
    Indicators include: otel_status=ERROR, error=true, http=xxx, grpc=xxx, exception=<type>
    """
    has_err = False
    ind = set()
    status = None
    ex_type = None
    for t in span.get("tags", []):
        k, v = t.get("key"), t.get("value")
        sv = str(v) if v is not None else ""
        if k == "otel.status_code" and sv.upper() == "ERROR":
            has_err = True
            ind.add("otel_status=ERROR")
        elif k == "error":
            if (isinstance(v, bool) and v) or sv.lower() == "true":
                has_err = True
                ind.add("error=true")
        elif k in ("http.status_code", "http.response.status_code"):
            code = safe_int(v)
            if code is not None:
                ind.add(f"http={code}")
                if status is None:
                    status = code
                if code >= MIN_STATUS_CODE:
                    has_err = True
        elif k in ("rpc.grpc.status_code", "grpc.code", "grpc.status_code"):
            g = safe_int(v)
            if g is not None:
                ind.add(f"grpc={g}")
                if g != 0:  # gRPC code 0 == OK
                    has_err = True
        elif "exception" in (k or "").lower():
            # Any non-empty exception.* tag counts as an error.
            if sv:
                ex_type = ex_type or sv
                ind.add(f"exception={sv[:80]}")  # trim long exception strings
                has_err = True
    return has_err, ind, status, ex_type


def trace_root_op(trace):
    """Operation name of the root span (first span with no references)."""
    spans = trace.get("spans", [])
    if not spans:
        return "unknown"
    root = next((s for s in spans if not s.get("references")), None)
    return (root or spans[0]).get("operationName", "unknown")


def trace_duration_ms(trace):
    """Longest span duration in the trace, in milliseconds."""
    spans = trace.get("spans", [])
    if not spans:
        return 0.0
    return max((sp.get("duration", 0) or 0) / 1000.0 for sp in spans)


def trace_start_iso(trace):
    """Earliest span startTime as an ISO8601 UTC string, or None if no spans."""
    spans = trace.get("spans", [])
    if not spans:
        return None
    st = min(sp.get("startTime", 0) for sp in spans)
    return datetime.utcfromtimestamp(st / 1_000_000).isoformat() + "Z"


def error_key_for_bucket(http_status, exception_type, indicators):
    """
    Priority for error_key:
      1) exception:<type>
      2) http:<code>  (if >= MIN_STATUS_CODE)
      3) grpc=<code>
      4) otel_status=ERROR
      5) error=true
      6) unknown_error
    """
    if exception_type:
        return f"exception:{exception_type}"
    if http_status is not None and http_status >= MIN_STATUS_CODE:
        return f"http:{http_status}"
    for i in indicators:
        if i.startswith("grpc="):
            return i
    if "otel_status=ERROR" in indicators:
        return "otel_status=ERROR"
    if "error=true" in indicators:
        return "error=true"
    return "unknown_error"


def fetch_traces(service):
    """Fetch up to `limit` traces for `service` in the window; [] on any failure."""
    params = {
        "service": service,
        "start": start_us,
        "end": end_us,
        "limit": limit,
    }
    try:
        resp = requests.get(f"{BASE}/traces", params=params, timeout=6)
        resp.raise_for_status()
        data = resp.json()
        # Jaeger wraps results in {"data": [...]}; fall through if not wrapped.
        return data.get("data", data)
    except requests.exceptions.RequestException as e:
        print(f"[WARN] Error fetching traces for {service}: {e}")
        return []
    except Exception as e:
        print(f"[WARN] Error processing Jaeger response for {service}: {e}")
        return []


# ----- Scan & bucket -----
print(f"[INFO] Window: {start_time.isoformat()}Z → {end_time.isoformat()}Z")
print(f"[INFO] Services: {target_services}")
print(f"[INFO] Caps: per_service_buckets={MAX_BUCKETS_PER_SERVICE}, global_buckets={MAX_BUCKETS_GLOBAL}, per_service_fetch_limit={limit}")

# Using a tuple (service, root_op, key) as the bucket key (no hashing).
bucket_best_example = {}          # (svc, root, key) -> best example dict
bucket_counts = defaultdict(int)  # (svc, root, key) -> count of matching traces
service_bucket_count = defaultdict(int)
global_bucket_order = []          # to enforce global cap in insertion order

for svc in target_services:
    # fetch_traces already returns [] on failure; this outer guard is a
    # belt-and-braces catch for unexpected errors so one service cannot
    # abort the whole scan.
    try:
        traces = fetch_traces(svc)
        if VERBOSE:
            print(f"[DEBUG] fetched {len(traces)} traces for {svc}")
    except Exception as e:
        print(f"[WARN] fetch failed for {svc}: {e}")
        continue

    for tr in traces:
        spans = tr.get("spans", [])
        if not spans:
            continue

        # Collect indicators across all spans of the trace.
        has_error_any = False
        indicators = set()
        http_status = None
        exception_type = None
        for sp in spans:
            err, ind, st, ex = find_error_indicators(sp)
            if err:
                has_error_any = True
            indicators |= ind
            if http_status is None and st is not None:
                http_status = st
            if exception_type is None and ex:
                exception_type = ex

        if REQUIRE_ERROR_TAG and not has_error_any:
            continue
        if not has_error_any and (http_status is None or http_status < MIN_STATUS_CODE):
            # neither explicit error nor status over threshold
            continue

        dur_ms = trace_duration_ms(tr)
        if MIN_DURATION_MS and dur_ms < MIN_DURATION_MS:
            continue

        root_op = trace_root_op(tr)
        start_iso = trace_start_iso(tr)
        key = error_key_for_bucket(http_status, exception_type, indicators)
        bucket_key = (svc, root_op, key)  # tuple key, no hashing

        # Respect per-service bucket cap (only blocks NEW buckets).
        if bucket_key not in bucket_best_example and service_bucket_count[svc] >= MAX_BUCKETS_PER_SERVICE:
            continue

        # Build a compact example payload.
        example = {
            "trace_id": tr.get("traceID"),
            "service": svc,
            "root_operation": root_op,
            "duration_ms": round(dur_ms, 2),
            "start_time": start_iso,
            "indicators": sorted(indicators)[:6],  # trim long indicator lists
        }

        # Choose the best example per bucket: prefer most recent, then longest.
        cur = bucket_best_example.get(bucket_key)
        if cur is None:
            better = True
        else:
            cur_ts = cur["start_time"] or ""
            new_ts = example["start_time"] or ""
            better = new_ts > cur_ts or (new_ts == cur_ts and example["duration_ms"] > cur["duration_ms"])
        if better:
            bucket_best_example[bucket_key] = example
            if cur is None:
                service_bucket_count[svc] += 1
                global_bucket_order.append(bucket_key)
        bucket_counts[bucket_key] += 1

        # Enforce global cap (drop oldest bucket when exceeding).
        if len(global_bucket_order) > MAX_BUCKETS_GLOBAL:
            drop = global_bucket_order.pop(0)
            svc_drop = drop[0]
            service_bucket_count[svc_drop] = max(0, service_bucket_count[svc_drop] - 1)
            bucket_best_example.pop(drop, None)
            bucket_counts.pop(drop, None)


# ----- Build compact outputs -----
def bucket_sort_key(k):
    """Sort buckets by service, then descending count, then example timestamp."""
    svc, root, key = k
    ex = bucket_best_example[k]
    cnt = bucket_counts[k]
    ts = ex["start_time"] or ""
    return (svc, -cnt, ts)


sorted_keys = sorted(bucket_best_example.keys(), key=bucket_sort_key)

error_traces_compact = []
for (svc, root, key) in sorted_keys:
    ex = bucket_best_example[(svc, root, key)]
    error_traces_compact.append({
        "signature": {
            "service": svc,
            "root_operation": root,
            "key": key,
            "signature_key": f"{svc}|{root}|{key}",  # human-readable, no hashing
        },
        "count": bucket_counts[(svc, root, key)],
        "example": ex,
    })

# Per-service rollup
service_rollup = defaultdict(lambda: {"buckets": 0, "total_matches": 0})
for (svc, root, key) in sorted_keys:
    service_rollup[svc]["buckets"] += 1
    service_rollup[svc]["total_matches"] += bucket_counts[(svc, root, key)]

output = {
    "window_minutes": lookback_minutes,
    "services": target_services,
    "caps": {
        "per_service_buckets": MAX_BUCKETS_PER_SERVICE,
        "global_buckets": MAX_BUCKETS_GLOBAL,
        "per_service_fetch_limit": limit,
    },
    # Convert to a plain dict so the payload holds no defaultdict
    # (identical JSON, but avoids accidental auto-vivification later).
    "service_rollup": dict(service_rollup),
    "error_traces": error_traces_compact,
    "error_traces_count": sum(bucket_counts.values()),
    "distinct_error_signatures": len(sorted_keys),
}

error_traces = output["error_traces"]

print("\n==== OUTPUT (compact error traces) ====")
print(json.dumps(output, indent=2))
copied