Sign in
agent:
Auto Exec

Summarize all recent exceptions and errors for a given set of services in Jaeger.

There was a problem that the LLM was not able to address. Please rephrase your prompt and try again.

Searches Jaeger for traces from specified services that contain errors (error=true) or HTTP status codes >= 400, then analyzes and counts exception types

Requires lookback_minutes (int), target_services (array of service names), and limit (int; use 5). Defaults lookback_minutes to 60 if not provided.

# Jaeger error/latency scan (compact output) # ------------------------------------------------------ # WHAT THIS DOES # - Scans recent traces per target service from Jaeger. # - Detects "problems" using span tags: # * otel.status_code == "ERROR" # * error == true # * http.status_code >= MIN_STATUS_CODE (default 400) # * grpc status code != 0 # * any "exception.*" tag # - Buckets similar problems by: (service, root_operation, error_key) # * error_key is one of: exception:<type> | http:<code> | grpc=<code> | otel_status=ERROR | error=true | unknown_error # - Keeps only the BEST example per bucket (most recent; if tie, longest duration) # - Caps how many buckets we keep globally and per service to prevent huge outputs. # # INPUTS expected in your runtime: # JAEGER_URL_OTEL : str (e.g., http://jaeger:16686 or http://jaeger:16686/ui) # lookback_minutes : int (e.g., 60) # target_services : list[str] (e.g., ["checkout","payment","frontend"]) # limit : int (per-service fetch cap, e.g., 100) # # OPTIONAL tuning knobs (set in your runner if you want, else defaults below): # MAX_BUCKETS_GLOBAL : int (default 50) -> total distinct error buckets to output # MAX_BUCKETS_PER_SERVICE : int (default 10) -> max buckets per service # MIN_STATUS_CODE : int (default 400) -> http status threshold to treat as error # MIN_DURATION_MS : int (default 0) -> only keep examples with duration >= this (0 = off) # REQUIRE_ERROR_TAG : bool (default False)-> if True, require explicit error tags (otel/error/grpc/exception) # VERBOSE : bool (default False)-> if True, prints extra progress logs # # OUTPUT SHAPE (printed as JSON): # { # "window_minutes": <int>, # "services": [ ... ], # "caps": { "per_service_buckets": <int>, "global_buckets": <int>, "per_service_fetch_limit": <int> }, # "service_rollup": { # "<service>": { "buckets": <int>, "total_matches": <int> }, # ... 
# }, # "error_traces": [ # { # "signature": { # "service": "<service>", # "root_operation": "<root operation name>", # "key": "<error key>", # e.g., "http:500", "exception:FooError", "grpc=14", "otel_status=ERROR", "error=true" # "signature_key": "<service>|<root_operation>|<key>" # readable bucket key (no hashing) # }, # "count": <int>, # how many traces matched this signature # "example": { # best example for this signature (trimmed) # "trace_id": "<traceID>", # "service": "<service>", # "root_operation": "<root operation name>", # "duration_ms": <float>, # "start_time": "<ISO8601 UTC>", # "indicators": ["http=500", "grpc=14", "otel_status=ERROR", ...] # up to ~6 tags, trimmed # } # }, # ... # ], # "error_traces_count": <int>, # sum of counts across all buckets # "distinct_error_signatures": <int> # number of buckets kept after caps # } # # WHY error_traces IS SMALL: # - We group similar errors and keep ONE best example per group. # - We cap the number of groups globally and per service. # - We trim long indicator lists. 
import json
import requests
from datetime import datetime, timedelta, timezone
from collections import defaultdict

error_traces = []

# ----- Inputs from your runner / env -----
# getEnvVar / lookback_minutes / target_services / limit are injected by the runner.
JAEGER_URL = getEnvVar("JAEGER_URL_OTEL").rstrip("/")

# Required inputs (provided by your tool form / runner)
lookback_minutes = int(lookback_minutes)  # e.g., 60
target_services = target_services         # e.g., ["checkout","payment","frontend"]
limit = int(limit)                        # per-service fetch cap, e.g., 100


def _knob(name, cast, default):
    """Return cast(<global named name>) when the optional knob is defined and
    convertible; otherwise return default.

    Replaces six bare `except:` clauses — undefined name (KeyError) or a bad
    value both fall back to the default, exactly as before.
    """
    try:
        return cast(globals()[name])
    except (KeyError, TypeError, ValueError):
        return default


# Optional tuning knobs with sensible defaults (may be absent in the runner).
MAX_BUCKETS_GLOBAL = _knob("MAX_BUCKETS_GLOBAL", int, 50)
MAX_BUCKETS_PER_SERVICE = _knob("MAX_BUCKETS_PER_SERVICE", int, 10)
MIN_STATUS_CODE = _knob("MIN_STATUS_CODE", int, 400)
MIN_DURATION_MS = _knob("MIN_DURATION_MS", int, 0)
REQUIRE_ERROR_TAG = _knob("REQUIRE_ERROR_TAG", bool, False)
VERBOSE = _knob("VERBOSE", bool, False)


# ----- Helpers -----
def api_base(url: str) -> str:
    """Normalize a Jaeger base URL (plain host, /ui, or /jaeger variants) to its .../api root."""
    if url.endswith("/api"):
        return url
    if url.endswith("/ui"):
        return url + "/api"
    if url.endswith("/jaeger"):
        return url + "/ui/api"
    return url + "/api"


BASE = api_base(JAEGER_URL)


def to_us(dt: datetime) -> int:
    """Convert a naive-UTC datetime to epoch microseconds.

    BUGFIX: the original called dt.timestamp() on a naive datetime, which
    Python interprets in the *local* timezone — skewing the Jaeger query
    window by the host's UTC offset on any non-UTC machine. Pin to UTC first.
    """
    return int(dt.replace(tzinfo=timezone.utc).timestamp() * 1_000_000)


# Kept naive (UTC) so `isoformat() + "Z"` below stays well-formed; avoids the
# deprecated datetime.utcnow().
end_time = datetime.now(timezone.utc).replace(tzinfo=None)
start_time = end_time - timedelta(minutes=lookback_minutes)
start_us, end_us = to_us(start_time), to_us(end_time)


def safe_int(v):
    """Best-effort int conversion; return None when v is not integer-like.

    The original's second fallback int(str(v)) never succeeds for any
    JSON-decoded tag value where int(v) failed, so it was dropped.
    """
    try:
        return int(v)
    except (TypeError, ValueError):
        return None


def find_error_indicators(span):
    """Inspect one span's tags for error evidence.

    Returns (has_error, indicators: set[str], http_status: int | None,
    exception_type: str | None). Indicators are short strings such as
    "otel_status=ERROR", "error=true", "http=500", "grpc=14",
    "exception=<type>".
    """
    has_err = False
    ind = set()
    status = None
    ex_type = None
    for t in span.get("tags", []):
        k, v = t.get("key"), t.get("value")
        sv = str(v) if v is not None else ""
        if k == "otel.status_code" and sv.upper() == "ERROR":
            has_err = True
            ind.add("otel_status=ERROR")
        elif k == "error":
            # Jaeger may encode this as a real bool or the string "true".
            if (isinstance(v, bool) and v) or sv.lower() == "true":
                has_err = True
                ind.add("error=true")
        elif k in ("http.status_code", "http.response.status_code"):
            code = safe_int(v)
            if code is not None:
                ind.add(f"http={code}")
                if status is None:
                    status = code  # first status seen wins
                if code >= MIN_STATUS_CODE:
                    has_err = True
        elif k in ("rpc.grpc.status_code", "grpc.code", "grpc.status_code"):
            g = safe_int(v)
            if g is not None:
                ind.add(f"grpc={g}")
                if g != 0:  # gRPC code 0 == OK
                    has_err = True
        elif "exception" in (k or "").lower():
            # Any exception.* tag with a value marks the span as an error.
            if sv:
                ex_type = ex_type or sv
                ind.add(f"exception={sv[:80]}")  # trim very long exception strings
                has_err = True
    return has_err, ind, status, ex_type


def trace_root_op(trace):
    """Return the root span's operationName (first span without references; else first span)."""
    spans = trace.get("spans", [])
    if not spans:
        return "unknown"
    root = next((s for s in spans if not s.get("references")), None)
    return (root or spans[0]).get("operationName", "unknown")


def trace_duration_ms(trace):
    """Return the longest single-span duration in the trace, in milliseconds."""
    spans = trace.get("spans", [])
    if not spans:
        return 0.0
    return max((sp.get("duration", 0) or 0) / 1000.0 for sp in spans)


def trace_start_iso(trace):
    """Return the earliest span start as an ISO8601 UTC string ("...Z"), or None if no spans."""
    spans = trace.get("spans", [])
    if not spans:
        return None
    st = min(sp.get("startTime", 0) for sp in spans)
    # fromtimestamp(tz=utc) replaces the deprecated utcfromtimestamp; made
    # naive again so isoformat() carries no "+00:00" before the appended "Z".
    dt = datetime.fromtimestamp(st / 1_000_000, tz=timezone.utc).replace(tzinfo=None)
    return dt.isoformat() + "Z"


def error_key_for_bucket(http_status, exception_type, indicators):
    """Pick the bucket key, by priority:
    1) exception:<type>  2) http:<code> (if >= MIN_STATUS_CODE)  3) grpc=<code>
    4) otel_status=ERROR  5) error=true  6) unknown_error
    """
    if exception_type:
        return f"exception:{exception_type}"
    if http_status is not None and http_status >= MIN_STATUS_CODE:
        return f"http:{http_status}"
    # Sorted so the chosen grpc indicator is deterministic when several codes
    # appear (the original iterated an unordered set).
    for i in sorted(indicators):
        if i.startswith("grpc="):
            return i
    if "otel_status=ERROR" in indicators:
        return "otel_status=ERROR"
    if "error=true" in indicators:
        return "error=true"
    return "unknown_error"


def fetch_traces(service):
    """Fetch recent traces for one service from GET {BASE}/traces.

    Never raises: any network or response-parsing failure is logged and an
    empty list is returned so one bad service cannot abort the whole scan.
    """
    params = {
        "service": service,
        "start": start_us,
        "end": end_us,
        "limit": limit,
    }
    try:
        resp = requests.get(f"{BASE}/traces", params=params, timeout=6)
        resp.raise_for_status()
        data = resp.json()
        # Jaeger wraps results in {"data": [...]}; fall back to the raw body.
        return data.get("data", data)
    except requests.exceptions.RequestException as e:
        print(f"[WARN] Error fetching traces for {service}: {e}")
        return []
    except Exception as e:
        print(f"[WARN] Error processing Jaeger response for {service}: {e}")
        return []


# ----- Scan & bucket -----
print(f"[INFO] Window: {start_time.isoformat()}Z → {end_time.isoformat()}Z")
print(f"[INFO] Services: {target_services}")
print(f"[INFO] Caps: per_service_buckets={MAX_BUCKETS_PER_SERVICE}, global_buckets={MAX_BUCKETS_GLOBAL}, per_service_fetch_limit={limit}")

# A tuple (service, root_op, key) is the bucket key (human-readable, no hashing).
bucket_best_example = {}            # (svc, root, key) -> best example dict
bucket_counts = defaultdict(int)    # (svc, root, key) -> matched-trace count
service_bucket_count = defaultdict(int)
global_bucket_order = []            # insertion order, used to enforce the global cap

for svc in target_services:
    # fetch_traces never raises, so no outer try/except is needed here.
    traces = fetch_traces(svc)
    if VERBOSE:
        print(f"[DEBUG] fetched {len(traces)} traces for {svc}")

    for tr in traces:
        spans = tr.get("spans", [])
        if not spans:
            continue

        # Aggregate error evidence across every span in the trace.
        has_error_any = False
        indicators = set()
        http_status = None
        exception_type = None
        for sp in spans:
            err, ind, st, ex = find_error_indicators(sp)
            if err:
                has_error_any = True
            indicators |= ind
            if http_status is None and st is not None:
                http_status = st
            if exception_type is None and ex:
                exception_type = ex

        if REQUIRE_ERROR_TAG and not has_error_any:
            continue
        if not has_error_any and (http_status is None or http_status < MIN_STATUS_CODE):
            # Neither an explicit error tag nor a status over the threshold.
            continue

        dur_ms = trace_duration_ms(tr)
        if MIN_DURATION_MS and dur_ms < MIN_DURATION_MS:
            continue

        root_op = trace_root_op(tr)
        start_iso = trace_start_iso(tr)
        key = error_key_for_bucket(http_status, exception_type, indicators)
        bucket_key = (svc, root_op, key)

        # Respect the per-service bucket cap for brand-new buckets.
        if bucket_key not in bucket_best_example and service_bucket_count[svc] >= MAX_BUCKETS_PER_SERVICE:
            continue

        # Compact example payload for this trace.
        example = {
            "trace_id": tr.get("traceID"),
            "service": svc,
            "root_operation": root_op,
            "duration_ms": round(dur_ms, 2),
            "start_time": start_iso,
            "indicators": sorted(indicators)[:6],  # trim long indicator lists
        }

        # Keep the best example per bucket: most recent first, then longest.
        cur = bucket_best_example.get(bucket_key)
        if cur is None:
            better = True
        else:
            cur_ts = cur["start_time"] or ""
            new_ts = example["start_time"] or ""
            better = new_ts > cur_ts or (new_ts == cur_ts and example["duration_ms"] > cur["duration_ms"])

        if better:
            bucket_best_example[bucket_key] = example
            if cur is None:
                service_bucket_count[svc] += 1
                global_bucket_order.append(bucket_key)
        bucket_counts[bucket_key] += 1

        # Enforce the global cap by dropping the oldest bucket.
        if len(global_bucket_order) > MAX_BUCKETS_GLOBAL:
            drop = global_bucket_order.pop(0)
            svc_drop = drop[0]
            service_bucket_count[svc_drop] = max(0, service_bucket_count[svc_drop] - 1)
            bucket_best_example.pop(drop, None)
            bucket_counts.pop(drop, None)


# ----- Build compact outputs -----
def bucket_sort_key(k):
    """Order buckets by service name, then descending match count, then example start time."""
    svc, root, key = k
    ex = bucket_best_example[k]
    cnt = bucket_counts[k]
    ts = ex["start_time"] or ""
    return (svc, -cnt, ts)


sorted_keys = sorted(bucket_best_example.keys(), key=bucket_sort_key)

error_traces_compact = []
for (svc, root, key) in sorted_keys:
    ex = bucket_best_example[(svc, root, key)]
    error_traces_compact.append({
        "signature": {
            "service": svc,
            "root_operation": root,
            "key": key,
            "signature_key": f"{svc}|{root}|{key}",  # human-readable, no hashing
        },
        "count": bucket_counts[(svc, root, key)],
        "example": ex,
    })

# Per-service rollup
service_rollup = defaultdict(lambda: {"buckets": 0, "total_matches": 0})
for (svc, root, key) in sorted_keys:
    service_rollup[svc]["buckets"] += 1
    service_rollup[svc]["total_matches"] += bucket_counts[(svc, root, key)]

output = {
    "window_minutes": lookback_minutes,
    "services": target_services,
    "caps": {
        "per_service_buckets": MAX_BUCKETS_PER_SERVICE,
        "global_buckets": MAX_BUCKETS_GLOBAL,
        "per_service_fetch_limit": limit,
    },
    # Plain dict (not defaultdict) for clean JSON serialization.
    "service_rollup": dict(service_rollup),
    "error_traces": error_traces_compact,
    "error_traces_count": sum(bucket_counts.values()),
    "distinct_error_signatures": len(sorted_keys),
}

error_traces = output["error_traces"]

print("\n==== OUTPUT (compact error traces) ====")
print(json.dumps(output, indent=2))
copied