github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/worker/worker_status.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Worker status api handler for reporting SDK harness debug info."""
    19  
    20  import logging
    21  import queue
    22  import sys
    23  import threading
    24  import time
    25  import traceback
    26  from collections import defaultdict
    27  
    28  import grpc
    29  
    30  from apache_beam.portability.api import beam_fn_api_pb2
    31  from apache_beam.portability.api import beam_fn_api_pb2_grpc
    32  from apache_beam.runners.worker.channel_factory import GRPCChannelFactory
    33  from apache_beam.runners.worker.statecache import StateCache
    34  from apache_beam.runners.worker.worker_id_interceptor import WorkerIdInterceptor
    35  from apache_beam.utils.sentinel import Sentinel
    36  
    37  try:
    38    from guppy import hpy
    39  except ImportError:
    40    hpy = None
    41  
    42  _LOGGER = logging.getLogger(__name__)
    43  
    44  # This SDK harness will (by default), log a "lull" in processing if it sees no
    45  # transitions in over 5 minutes.
    46  # 5 minutes * 60 seconds * 1000 millis * 1000 micros * 1000 nanoseconds
    47  DEFAULT_LOG_LULL_TIMEOUT_NS = 5 * 60 * 1000 * 1000 * 1000
    48  
    49  # Full thread dump is performed at most every 20 minutes.
    50  LOG_LULL_FULL_THREAD_DUMP_INTERVAL_S = 20 * 60
    51  
    52  # Full thread dump is performed if the lull is more than 20 minutes.
    53  LOG_LULL_FULL_THREAD_DUMP_LULL_S = 20 * 60
    54  
    55  
    56  def thread_dump():
    57    """Get a thread dump for the current SDK worker harness. """
    58    # deduplicate threads with same stack trace
    59    stack_traces = defaultdict(list)
    60    frames = sys._current_frames()  # pylint: disable=protected-access
    61  
    62    for t in threading.enumerate():
    63      try:
    64        stack_trace = ''.join(traceback.format_stack(frames[t.ident]))
    65      except KeyError:
    66        # the thread may have been destroyed already while enumerating, in such
    67        # case, skip to next thread.
    68        continue
    69      thread_ident_name = (t.ident, t.name)
    70      stack_traces[stack_trace].append(thread_ident_name)
    71  
    72    all_traces = ['=' * 10 + ' THREAD DUMP ' + '=' * 10]
    73    for stack, identity in stack_traces.items():
    74      ident, name = identity[0]
    75      trace = '--- Thread #%s name: %s %s---\n' % (
    76          ident,
    77          name,
    78          'and other %d threads' %
    79          (len(identity) - 1) if len(identity) > 1 else '')
    80      if len(identity) > 1:
    81        trace += 'threads: %s\n' % identity
    82      trace += stack
    83      all_traces.append(trace)
    84    all_traces.append('=' * 30)
    85    return '\n'.join(all_traces)
    86  
    87  
    88  def heap_dump():
    89    """Get a heap dump for the current SDK worker harness. """
    90    banner = '=' * 10 + ' HEAP DUMP ' + '=' * 10 + '\n'
    91    if not hpy:
    92      heap = 'Unable to import guppy, the heap dump will be skipped.\n'
    93    else:
    94      heap = '%s\n' % hpy().heap()
    95    ending = '=' * 30
    96    return banner + heap + ending
    97  
    98  
    99  def _state_cache_stats(state_cache):
   100    #type: (StateCache) -> str
   101  
   102    """Gather state cache statistics."""
   103    cache_stats = ['=' * 10 + ' CACHE STATS ' + '=' * 10]
   104    if not state_cache.is_cache_enabled():
   105      cache_stats.append("Cache disabled")
   106    else:
   107      cache_stats.append(state_cache.describe_stats())
   108    return '\n'.join(cache_stats)
   109  
   110  
   111  def _active_processing_bundles_state(bundle_process_cache):
   112    """Gather information about the currently in-processing active bundles.
   113  
   114    The result only keeps the longest lasting 10 bundles to avoid excessive
   115    spamming.
   116    """
   117    active_bundles = ['=' * 10 + ' ACTIVE PROCESSING BUNDLES ' + '=' * 10]
   118    if not bundle_process_cache.active_bundle_processors:
   119      active_bundles.append("No active processing bundles.")
   120    else:
   121      cache = []
   122      for instruction in list(
   123          bundle_process_cache.active_bundle_processors.keys()):
   124        processor = bundle_process_cache.lookup(instruction)
   125        if processor:
   126          info = processor.state_sampler.get_info()
   127          cache.append((
   128              instruction,
   129              processor.process_bundle_descriptor.id,
   130              info.tracked_thread,
   131              info.time_since_transition))
   132      # reverse sort active bundle by time since last transition, keep top 10.
   133      cache.sort(key=lambda x: x[-1], reverse=True)
   134      for s in cache[:10]:
   135        state = '--- instruction %s ---\n' % s[0]
   136        state += 'ProcessBundleDescriptorId: %s\n' % s[1]
   137        state += "tracked thread: %s\n" % s[2]
   138        state += "time since transition: %.2f seconds\n" % (s[3] / 1e9)
   139        active_bundles.append(state)
   140  
   141    active_bundles.append('=' * 30)
   142    return '\n'.join(active_bundles)
   143  
   144  
   145  DONE = Sentinel.sentinel
   146  
   147  
   148  class FnApiWorkerStatusHandler(object):
   149    """FnApiWorkerStatusHandler handles worker status request from Runner. """
   150    def __init__(
   151        self,
   152        status_address,
   153        bundle_process_cache=None,
   154        state_cache=None,
   155        enable_heap_dump=False,
   156        log_lull_timeout_ns=DEFAULT_LOG_LULL_TIMEOUT_NS):
   157      """Initialize FnApiWorkerStatusHandler.
   158  
   159      Args:
   160        status_address: The URL Runner uses to host the WorkerStatus server.
   161        bundle_process_cache: The BundleProcessor cache dict from sdk worker.
   162        state_cache: The StateCache form sdk worker.
   163      """
   164      self._alive = True
   165      self._bundle_process_cache = bundle_process_cache
   166      self._state_cache = state_cache
   167      ch = GRPCChannelFactory.insecure_channel(status_address)
   168      grpc.channel_ready_future(ch).result(timeout=60)
   169      self._status_channel = grpc.intercept_channel(ch, WorkerIdInterceptor())
   170      self._status_stub = beam_fn_api_pb2_grpc.BeamFnWorkerStatusStub(
   171          self._status_channel)
   172      self._responses = queue.Queue()
   173      self.log_lull_timeout_ns = log_lull_timeout_ns
   174      self._last_full_thread_dump_secs = 0.0
   175      self._last_lull_logged_secs = 0.0
   176      self._server = threading.Thread(
   177          target=lambda: self._serve(), name='fn_api_status_handler')
   178      self._server.daemon = True
   179      self._enable_heap_dump = enable_heap_dump
   180      self._server.start()
   181      self._lull_logger = threading.Thread(
   182          target=lambda: self._log_lull_in_bundle_processor(
   183              self._bundle_process_cache),
   184          name='lull_operation_logger')
   185      self._lull_logger.daemon = True
   186      self._lull_logger.start()
   187  
   188    def _get_responses(self):
   189      while True:
   190        response = self._responses.get()
   191        if response is DONE:
   192          self._alive = False
   193          return
   194        yield response
   195  
   196    def _serve(self):
   197      while self._alive:
   198        for request in self._status_stub.WorkerStatus(self._get_responses()):
   199          try:
   200            self._responses.put(
   201                beam_fn_api_pb2.WorkerStatusResponse(
   202                    id=request.id, status_info=self.generate_status_response()))
   203          except Exception:
   204            traceback_string = traceback.format_exc()
   205            self._responses.put(
   206                beam_fn_api_pb2.WorkerStatusResponse(
   207                    id=request.id,
   208                    error="Exception encountered while generating "
   209                    "status page: %s" % traceback_string))
   210  
   211    def generate_status_response(self):
   212      all_status_sections = []
   213  
   214      if self._state_cache:
   215        all_status_sections.append(_state_cache_stats(self._state_cache))
   216  
   217      if self._bundle_process_cache:
   218        all_status_sections.append(
   219            _active_processing_bundles_state(self._bundle_process_cache))
   220  
   221      all_status_sections.append(thread_dump())
   222      if self._enable_heap_dump:
   223        all_status_sections.append(heap_dump())
   224  
   225      return '\n'.join(all_status_sections)
   226  
   227    def close(self):
   228      self._responses.put(DONE, timeout=5)
   229  
   230    def _log_lull_in_bundle_processor(self, bundle_process_cache):
   231      while True:
   232        time.sleep(2 * 60)
   233        if bundle_process_cache and bundle_process_cache.active_bundle_processors:
   234          for instruction in list(
   235              bundle_process_cache.active_bundle_processors.keys()):
   236            processor = bundle_process_cache.lookup(instruction)
   237            if processor:
   238              info = processor.state_sampler.get_info()
   239              self._log_lull_sampler_info(info)
   240  
   241    def _log_lull_sampler_info(self, sampler_info):
   242      if not self._passed_lull_timeout_since_last_log():
   243        return
   244      if (sampler_info and sampler_info.time_since_transition and
   245          sampler_info.time_since_transition > self.log_lull_timeout_ns):
   246        step_name = sampler_info.state_name.step_name
   247        state_name = sampler_info.state_name.name
   248        lull_seconds = sampler_info.time_since_transition / 1e9
   249        state_lull_log = (
   250            'Operation ongoing for over %.2f seconds in state %s' %
   251            (lull_seconds, state_name))
   252        step_name_log = (' in step %s ' % step_name) if step_name else ''
   253  
   254        exec_thread = getattr(sampler_info, 'tracked_thread', None)
   255        if exec_thread is not None:
   256          thread_frame = sys._current_frames().get(exec_thread.ident)  # pylint: disable=protected-access
   257          stack_trace = '\n'.join(
   258              traceback.format_stack(thread_frame)) if thread_frame else ''
   259        else:
   260          stack_trace = '-NOT AVAILABLE-'
   261  
   262        _LOGGER.warning(
   263            '%s%s without returning. Current Traceback:\n%s',
   264            state_lull_log,
   265            step_name_log,
   266            stack_trace)
   267  
   268    def _passed_lull_timeout_since_last_log(self) -> bool:
   269      if (time.time() - self._last_lull_logged_secs >
   270          self.log_lull_timeout_ns / 1e9):
   271        self._last_lull_logged_secs = time.time()
   272        return True
   273      else:
   274        return False