github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/worker/logger.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  # cython: language_level=3
    19  
    20  """Python worker logging."""
    21  
    22  # pytype: skip-file
    23  # mypy: disallow-untyped-defs
    24  
    25  import contextlib
    26  import json
    27  import logging
    28  import threading
    29  import traceback
    30  from typing import Any
    31  from typing import Dict
    32  from typing import Iterator
    33  from typing import List
    34  
    35  from apache_beam.runners.worker import statesampler
    36  
    37  
    38  # Per-thread worker information. This is used only for logging to set
    39  # context information that changes while work items get executed:
    40  # work_item_id, step_name, stage_name.
    41  class _PerThreadWorkerData(threading.local):
    42    def __init__(self):
    43      # type: () -> None
    44      super().__init__()
    45      # in the list, as going up and down all the way to zero incurs several
    46      # reallocations.
    47      self.stack = []  # type: List[Dict[str, Any]]
    48  
    49    def get_data(self):
    50      # type: () -> Dict[str, Any]
    51      all_data = {}
    52      for datum in self.stack:
    53        all_data.update(datum)
    54      return all_data
    55  
    56  
    57  per_thread_worker_data = _PerThreadWorkerData()
    58  
    59  
    60  @contextlib.contextmanager
    61  def PerThreadLoggingContext(**kwargs):
    62    # type: (**Any) -> Iterator[None]
    63  
    64    """A context manager to add per thread attributes."""
    65    stack = per_thread_worker_data.stack
    66    stack.append(kwargs)
    67    yield
    68    stack.pop()
    69  
    70  
    71  class JsonLogFormatter(logging.Formatter):
    72    """A JSON formatter class as expected by the logging standard module."""
    73    def __init__(self, job_id, worker_id):
    74      # type: (str, str) -> None
    75      super().__init__()
    76      self.job_id = job_id
    77      self.worker_id = worker_id
    78  
    79    def format(self, record):
    80      # type: (logging.LogRecord) -> str
    81  
    82      """Returns a JSON string based on a LogRecord instance.
    83  
    84      Args:
    85        record: A LogRecord instance. See below for details.
    86  
    87      Returns:
    88        A JSON string representing the record.
    89  
    90      A LogRecord instance has the following attributes and is used for
    91      formatting the final message.
    92  
    93      Attributes:
    94        created: A double representing the timestamp for record creation
    95          (e.g., 1438365207.624597). Note that the number contains also msecs and
    96          microsecs information. Part of this is also available in the 'msecs'
    97          attribute.
    98        msecs: A double representing the msecs part of the record creation
    99          (e.g., 624.5970726013184).
   100        msg: Logging message containing formatting instructions or an arbitrary
   101          object. This is the first argument of a log call.
   102        args: A tuple containing the positional arguments for the logging call.
   103        levelname: A string. Possible values are: INFO, WARNING, ERROR, etc.
   104        exc_info: None or a 3-tuple with exception information as it is
   105          returned by a call to sys.exc_info().
   106        name: Logger's name. Most logging is done using the default root logger
   107          and therefore the name will be 'root'.
   108        filename: Basename of the file where logging occurred.
   109        funcName: Name of the function where logging occurred.
   110        process: The PID of the process running the worker.
   111        thread:  An id for the thread where the record was logged. This is not a
   112          real TID (the one provided by OS) but rather the id (address) of a
   113          Python thread object. Nevertheless having this value can allow to
   114          filter log statement from only one specific thread.
   115      """
   116      output = {}  # type: Dict[str, Any]
   117      output['timestamp'] = {
   118          'seconds': int(record.created), 'nanos': int(record.msecs * 1000000)
   119      }
   120      # ERROR. INFO, DEBUG log levels translate into the same for severity
   121      # property. WARNING becomes WARN.
   122      output['severity'] = (
   123          record.levelname if record.levelname != 'WARNING' else 'WARN')
   124  
   125      # msg could be an arbitrary object, convert it to a string first.
   126      record_msg = str(record.msg)
   127  
   128      # Prepare the actual message using the message formatting string and the
   129      # positional arguments as they have been used in the log call.
   130      if record.args:
   131        try:
   132          output['message'] = record_msg % record.args
   133        except (TypeError, ValueError):
   134          output['message'] = '%s with args (%s)' % (record_msg, record.args)
   135      else:
   136        output['message'] = record_msg
   137  
   138      # The thread ID is logged as a combination of the process ID and thread ID
   139      # since workers can run in multiple processes.
   140      output['thread'] = '%s:%s' % (record.process, record.thread)
   141      # job ID and worker ID. These do not change during the lifetime of a worker.
   142      output['job'] = self.job_id
   143      output['worker'] = self.worker_id
   144      # Stage, step and work item ID come from thread local storage since they
   145      # change with every new work item leased for execution. If there is no
   146      # work item ID then we make sure the step is undefined too.
   147      data = per_thread_worker_data.get_data()
   148      if 'work_item_id' in data:
   149        output['work'] = data['work_item_id']
   150  
   151      tracker = statesampler.get_current_tracker()
   152      if tracker:
   153        output['stage'] = tracker.stage_name
   154  
   155        if tracker.current_state() and tracker.current_state().name_context:
   156          output['step'] = tracker.current_state().name_context.logging_name()
   157  
   158      # All logging happens using the root logger. We will add the basename of the
   159      # file and the function name where the logging happened to make it easier
   160      # to identify who generated the record.
   161      output['logger'] = '%s:%s:%s' % (
   162          record.name, record.filename, record.funcName)
   163      # Add exception information if any is available.
   164      if record.exc_info:
   165        output['exception'] = ''.join(
   166            traceback.format_exception(*record.exc_info))
   167  
   168      return json.dumps(output)
   169  
   170  
   171  def initialize(job_id, worker_id, log_path, log_level=logging.INFO):
   172    # type: (str, str, str, int) -> None
   173  
   174    """Initialize root logger so that we log JSON to a file and text to stdout."""
   175  
   176    file_handler = logging.FileHandler(log_path)
   177    file_handler.setFormatter(JsonLogFormatter(job_id, worker_id))
   178    logging.getLogger().addHandler(file_handler)
   179  
   180    # Default level is set to INFO to avoid logging various DEBUG level log calls
   181    # sprinkled throughout the code.
   182    logging.getLogger().setLevel(log_level)