github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/worker/worker_status.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Worker status API handler for reporting SDK harness debug info."""

import logging
import queue
import sys
import threading
import time
import traceback
from collections import defaultdict

import grpc

from apache_beam.portability.api import beam_fn_api_pb2
from apache_beam.portability.api import beam_fn_api_pb2_grpc
from apache_beam.runners.worker.channel_factory import GRPCChannelFactory
from apache_beam.runners.worker.statecache import StateCache
from apache_beam.runners.worker.worker_id_interceptor import WorkerIdInterceptor
from apache_beam.utils.sentinel import Sentinel

try:
  from guppy import hpy
except ImportError:
  hpy = None

_LOGGER = logging.getLogger(__name__)

# This SDK harness will, by default, log a "lull" in processing if it sees no
# transitions in over 5 minutes.
# 5 minutes * 60 seconds * 1000 millis * 1000 micros * 1000 nanoseconds
DEFAULT_LOG_LULL_TIMEOUT_NS = 5 * 60 * 1000 * 1000 * 1000

# A full thread dump is performed at most every 20 minutes.
LOG_LULL_FULL_THREAD_DUMP_INTERVAL_S = 20 * 60

# A full thread dump is performed if the lull lasts more than 20 minutes.
LOG_LULL_FULL_THREAD_DUMP_LULL_S = 20 * 60


def thread_dump():
  """Get a thread dump for the current SDK worker harness."""
  # Deduplicate threads that share the same stack trace.
  stack_traces = defaultdict(list)
  frames = sys._current_frames()  # pylint: disable=protected-access

  for t in threading.enumerate():
    try:
      stack_trace = ''.join(traceback.format_stack(frames[t.ident]))
    except KeyError:
      # The thread may have been destroyed while enumerating; skip to the
      # next thread.
      continue
    thread_ident_name = (t.ident, t.name)
    stack_traces[stack_trace].append(thread_ident_name)

  all_traces = ['=' * 10 + ' THREAD DUMP ' + '=' * 10]
  for stack, identity in stack_traces.items():
    ident, name = identity[0]
    trace = '--- Thread #%s name: %s %s---\n' % (
        ident,
        name,
        'and other %d threads' %
        (len(identity) - 1) if len(identity) > 1 else '')
    if len(identity) > 1:
      trace += 'threads: %s\n' % identity
    trace += stack
    all_traces.append(trace)
  all_traces.append('=' * 30)
  return '\n'.join(all_traces)


def heap_dump():
  """Get a heap dump for the current SDK worker harness."""
  banner = '=' * 10 + ' HEAP DUMP ' + '=' * 10 + '\n'
  if not hpy:
    heap = 'Unable to import guppy, the heap dump will be skipped.\n'
  else:
    heap = '%s\n' % hpy().heap()
  ending = '=' * 30
  return banner + heap + ending


def _state_cache_stats(state_cache):
  # type: (StateCache) -> str

  """Gather state cache statistics."""
  cache_stats = ['=' * 10 + ' CACHE STATS ' + '=' * 10]
  if not state_cache.is_cache_enabled():
    cache_stats.append("Cache disabled")
  else:
    cache_stats.append(state_cache.describe_stats())
  return '\n'.join(cache_stats)


def _active_processing_bundles_state(bundle_process_cache):
  """Gather information about the bundles currently being processed.

  Only the 10 longest-running bundles are kept to avoid excessive output.
  """
  active_bundles = ['=' * 10 + ' ACTIVE PROCESSING BUNDLES ' + '=' * 10]
  if not bundle_process_cache.active_bundle_processors:
    active_bundles.append("No active processing bundles.")
  else:
    cache = []
    for instruction in list(
        bundle_process_cache.active_bundle_processors.keys()):
      processor = bundle_process_cache.lookup(instruction)
      if processor:
        info = processor.state_sampler.get_info()
        cache.append((
            instruction,
            processor.process_bundle_descriptor.id,
            info.tracked_thread,
            info.time_since_transition))
    # Sort active bundles by time since the last transition (descending) and
    # keep the top 10.
    cache.sort(key=lambda x: x[-1], reverse=True)
    for s in cache[:10]:
      state = '--- instruction %s ---\n' % s[0]
      state += 'ProcessBundleDescriptorId: %s\n' % s[1]
      state += "tracked thread: %s\n" % s[2]
      state += "time since transition: %.2f seconds\n" % (s[3] / 1e9)
      active_bundles.append(state)

  active_bundles.append('=' * 30)
  return '\n'.join(active_bundles)


DONE = Sentinel.sentinel
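
# For orientation: the handler defined below concatenates the sections
# produced by the helpers above into a single status page, roughly of the
# shape sketched here. The cache and bundle sections appear only when the
# corresponding caches are supplied, the heap dump only when it is enabled,
# and the '...' lines stand in for the actual section contents.
#
#   ========== CACHE STATS ==========
#   ...
#   ========== ACTIVE PROCESSING BUNDLES ==========
#   ...
#   ==============================
#   ========== THREAD DUMP ==========
#   ...
#   ==============================
#   ========== HEAP DUMP ==========
#   ...
#   ==============================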


class FnApiWorkerStatusHandler(object):
  """Handles worker status requests from the Runner."""
  def __init__(
      self,
      status_address,
      bundle_process_cache=None,
      state_cache=None,
      enable_heap_dump=False,
      log_lull_timeout_ns=DEFAULT_LOG_LULL_TIMEOUT_NS):
    """Initialize FnApiWorkerStatusHandler.

    Args:
      status_address: The address at which the Runner hosts the WorkerStatus
        service.
      bundle_process_cache: The BundleProcessor cache dict from the SDK worker.
      state_cache: The StateCache from the SDK worker.
      enable_heap_dump: Whether to include a heap dump (requires guppy) in the
        generated status response.
      log_lull_timeout_ns: Log a processing lull if no state transition has
        been observed for more than this many nanoseconds.
    """
    self._alive = True
    self._bundle_process_cache = bundle_process_cache
    self._state_cache = state_cache
    ch = GRPCChannelFactory.insecure_channel(status_address)
    grpc.channel_ready_future(ch).result(timeout=60)
    self._status_channel = grpc.intercept_channel(ch, WorkerIdInterceptor())
    self._status_stub = beam_fn_api_pb2_grpc.BeamFnWorkerStatusStub(
        self._status_channel)
    self._responses = queue.Queue()
    self.log_lull_timeout_ns = log_lull_timeout_ns
    self._last_full_thread_dump_secs = 0.0
    self._last_lull_logged_secs = 0.0
    # Serve the bidirectional WorkerStatus stream on a daemon thread.
    self._server = threading.Thread(
        target=lambda: self._serve(), name='fn_api_status_handler')
    self._server.daemon = True
    self._enable_heap_dump = enable_heap_dump
    self._server.start()
    # Periodically log long-running ("lull") operations on a second daemon
    # thread.
    self._lull_logger = threading.Thread(
        target=lambda: self._log_lull_in_bundle_processor(
            self._bundle_process_cache),
        name='lull_operation_logger')
    self._lull_logger.daemon = True
    self._lull_logger.start()

  def _get_responses(self):
    # Yield queued responses to the WorkerStatus stream until DONE is seen.
    while True:
      response = self._responses.get()
      if response is DONE:
        self._alive = False
        return
      yield response

  def _serve(self):
    while self._alive:
      for request in self._status_stub.WorkerStatus(self._get_responses()):
        try:
          self._responses.put(
              beam_fn_api_pb2.WorkerStatusResponse(
                  id=request.id, status_info=self.generate_status_response()))
        except Exception:
          traceback_string = traceback.format_exc()
          self._responses.put(
              beam_fn_api_pb2.WorkerStatusResponse(
                  id=request.id,
                  error="Exception encountered while generating "
                  "status page: %s" % traceback_string))

  def generate_status_response(self):
    all_status_sections = []

    if self._state_cache:
      all_status_sections.append(_state_cache_stats(self._state_cache))

    if self._bundle_process_cache:
      all_status_sections.append(
          _active_processing_bundles_state(self._bundle_process_cache))

    all_status_sections.append(thread_dump())
    if self._enable_heap_dump:
      all_status_sections.append(heap_dump())

    return '\n'.join(all_status_sections)

  def close(self):
    self._responses.put(DONE, timeout=5)

  def _log_lull_in_bundle_processor(self, bundle_process_cache):
    while True:
      time.sleep(2 * 60)
      if bundle_process_cache and bundle_process_cache.active_bundle_processors:
        for instruction in list(
            bundle_process_cache.active_bundle_processors.keys()):
          processor = bundle_process_cache.lookup(instruction)
          if processor:
            info = processor.state_sampler.get_info()
            self._log_lull_sampler_info(info)
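
  # Worked example of the lull thresholds applied below, assuming the default
  # settings: the lull logger thread wakes every 2 minutes; with
  # DEFAULT_LOG_LULL_TIMEOUT_NS = 5 * 60 * 10**9 = 300_000_000_000 ns, a bundle
  # whose sampler reports time_since_transition of 360 * 10**9 ns (6 minutes)
  # exceeds the threshold and is reported by _log_lull_sampler_info, while
  # _passed_lull_timeout_since_last_log rate-limits such warnings to at most
  # one per log_lull_timeout_ns (i.e. one every 5 minutes by default).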

  def _log_lull_sampler_info(self, sampler_info):
    if not self._passed_lull_timeout_since_last_log():
      return
    if (sampler_info and sampler_info.time_since_transition and
        sampler_info.time_since_transition > self.log_lull_timeout_ns):
      step_name = sampler_info.state_name.step_name
      state_name = sampler_info.state_name.name
      lull_seconds = sampler_info.time_since_transition / 1e9
      state_lull_log = (
          'Operation ongoing for over %.2f seconds in state %s' %
          (lull_seconds, state_name))
      step_name_log = (' in step %s ' % step_name) if step_name else ''

      exec_thread = getattr(sampler_info, 'tracked_thread', None)
      if exec_thread is not None:
        thread_frame = sys._current_frames().get(exec_thread.ident)  # pylint: disable=protected-access
        stack_trace = '\n'.join(
            traceback.format_stack(thread_frame)) if thread_frame else ''
      else:
        stack_trace = '-NOT AVAILABLE-'

      _LOGGER.warning(
          '%s%s without returning. Current Traceback:\n%s',
          state_lull_log,
          step_name_log,
          stack_trace)

  def _passed_lull_timeout_since_last_log(self) -> bool:
    if (time.time() - self._last_lull_logged_secs >
        self.log_lull_timeout_ns / 1e9):
      self._last_lull_logged_secs = time.time()
      return True
    else:
      return False
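

# Usage sketch (illustrative only; the address and the cache objects are
# placeholders, not names defined by this module):
#
#   handler = FnApiWorkerStatusHandler(
#       'localhost:12345',
#       bundle_process_cache=bundle_processor_cache,
#       state_cache=state_cache,
#       enable_heap_dump=True)
#   ...
#   handler.close()  # Enqueues DONE, ending _get_responses() and the stream.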