github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/worker/logger.py (about) 1 # 2 # Licensed to the Apache Software Foundation (ASF) under one or more 3 # contributor license agreements. See the NOTICE file distributed with 4 # this work for additional information regarding copyright ownership. 5 # The ASF licenses this file to You under the Apache License, Version 2.0 6 # (the "License"); you may not use this file except in compliance with 7 # the License. You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 # 17 18 # cython: language_level=3 19 20 """Python worker logging.""" 21 22 # pytype: skip-file 23 # mypy: disallow-untyped-defs 24 25 import contextlib 26 import json 27 import logging 28 import threading 29 import traceback 30 from typing import Any 31 from typing import Dict 32 from typing import Iterator 33 from typing import List 34 35 from apache_beam.runners.worker import statesampler 36 37 38 # Per-thread worker information. This is used only for logging to set 39 # context information that changes while work items get executed: 40 # work_item_id, step_name, stage_name. 41 class _PerThreadWorkerData(threading.local): 42 def __init__(self): 43 # type: () -> None 44 super().__init__() 45 # in the list, as going up and down all the way to zero incurs several 46 # reallocations. 47 self.stack = [] # type: List[Dict[str, Any]] 48 49 def get_data(self): 50 # type: () -> Dict[str, Any] 51 all_data = {} 52 for datum in self.stack: 53 all_data.update(datum) 54 return all_data 55 56 57 per_thread_worker_data = _PerThreadWorkerData() 58 59 60 @contextlib.contextmanager 61 def PerThreadLoggingContext(**kwargs): 62 # type: (**Any) -> Iterator[None] 63 64 """A context manager to add per thread attributes.""" 65 stack = per_thread_worker_data.stack 66 stack.append(kwargs) 67 yield 68 stack.pop() 69 70 71 class JsonLogFormatter(logging.Formatter): 72 """A JSON formatter class as expected by the logging standard module.""" 73 def __init__(self, job_id, worker_id): 74 # type: (str, str) -> None 75 super().__init__() 76 self.job_id = job_id 77 self.worker_id = worker_id 78 79 def format(self, record): 80 # type: (logging.LogRecord) -> str 81 82 """Returns a JSON string based on a LogRecord instance. 83 84 Args: 85 record: A LogRecord instance. See below for details. 86 87 Returns: 88 A JSON string representing the record. 89 90 A LogRecord instance has the following attributes and is used for 91 formatting the final message. 92 93 Attributes: 94 created: A double representing the timestamp for record creation 95 (e.g., 1438365207.624597). Note that the number contains also msecs and 96 microsecs information. Part of this is also available in the 'msecs' 97 attribute. 98 msecs: A double representing the msecs part of the record creation 99 (e.g., 624.5970726013184). 100 msg: Logging message containing formatting instructions or an arbitrary 101 object. This is the first argument of a log call. 102 args: A tuple containing the positional arguments for the logging call. 103 levelname: A string. Possible values are: INFO, WARNING, ERROR, etc. 104 exc_info: None or a 3-tuple with exception information as it is 105 returned by a call to sys.exc_info(). 106 name: Logger's name. Most logging is done using the default root logger 107 and therefore the name will be 'root'. 108 filename: Basename of the file where logging occurred. 109 funcName: Name of the function where logging occurred. 110 process: The PID of the process running the worker. 111 thread: An id for the thread where the record was logged. This is not a 112 real TID (the one provided by OS) but rather the id (address) of a 113 Python thread object. Nevertheless having this value can allow to 114 filter log statement from only one specific thread. 115 """ 116 output = {} # type: Dict[str, Any] 117 output['timestamp'] = { 118 'seconds': int(record.created), 'nanos': int(record.msecs * 1000000) 119 } 120 # ERROR. INFO, DEBUG log levels translate into the same for severity 121 # property. WARNING becomes WARN. 122 output['severity'] = ( 123 record.levelname if record.levelname != 'WARNING' else 'WARN') 124 125 # msg could be an arbitrary object, convert it to a string first. 126 record_msg = str(record.msg) 127 128 # Prepare the actual message using the message formatting string and the 129 # positional arguments as they have been used in the log call. 130 if record.args: 131 try: 132 output['message'] = record_msg % record.args 133 except (TypeError, ValueError): 134 output['message'] = '%s with args (%s)' % (record_msg, record.args) 135 else: 136 output['message'] = record_msg 137 138 # The thread ID is logged as a combination of the process ID and thread ID 139 # since workers can run in multiple processes. 140 output['thread'] = '%s:%s' % (record.process, record.thread) 141 # job ID and worker ID. These do not change during the lifetime of a worker. 142 output['job'] = self.job_id 143 output['worker'] = self.worker_id 144 # Stage, step and work item ID come from thread local storage since they 145 # change with every new work item leased for execution. If there is no 146 # work item ID then we make sure the step is undefined too. 147 data = per_thread_worker_data.get_data() 148 if 'work_item_id' in data: 149 output['work'] = data['work_item_id'] 150 151 tracker = statesampler.get_current_tracker() 152 if tracker: 153 output['stage'] = tracker.stage_name 154 155 if tracker.current_state() and tracker.current_state().name_context: 156 output['step'] = tracker.current_state().name_context.logging_name() 157 158 # All logging happens using the root logger. We will add the basename of the 159 # file and the function name where the logging happened to make it easier 160 # to identify who generated the record. 161 output['logger'] = '%s:%s:%s' % ( 162 record.name, record.filename, record.funcName) 163 # Add exception information if any is available. 164 if record.exc_info: 165 output['exception'] = ''.join( 166 traceback.format_exception(*record.exc_info)) 167 168 return json.dumps(output) 169 170 171 def initialize(job_id, worker_id, log_path, log_level=logging.INFO): 172 # type: (str, str, str, int) -> None 173 174 """Initialize root logger so that we log JSON to a file and text to stdout.""" 175 176 file_handler = logging.FileHandler(log_path) 177 file_handler.setFormatter(JsonLogFormatter(job_id, worker_id)) 178 logging.getLogger().addHandler(file_handler) 179 180 # Default level is set to INFO to avoid logging various DEBUG level log calls 181 # sprinkled throughout the code. 182 logging.getLogger().setLevel(log_level)