#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Pickler for values, functions, and classes.

For internal use only. No backwards compatibility guarantees.

Pickles created by the pickling library contain non-ASCII characters, so
we base64-encode the results so that we can put them in JSON objects.
The pickler is used to embed FlatMap callable objects into the workflow JSON
description.

The pickler module should be used to pickle functions and modules; for values,
the coders.*PickleCoder classes should be used instead.
"""

# pytype: skip-file

import base64
import bz2
import logging
import sys
import threading
import traceback
import types
import zlib
from typing import Any
from typing import Dict
from typing import Tuple

import dill

settings = {'dill_byref': None}

if sys.version_info >= (3, 11) and dill.__version__ == "0.3.1.1":
  # Let's make dill 0.3.1.1 support Python 3.11.

  # The following function is based on 'save_code' from 'dill'
  # Author: Mike McKerns (mmckerns @caltech and @uqfoundation)
  # Copyright (c) 2008-2015 California Institute of Technology.
  # Copyright (c) 2016-2023 The Uncertainty Quantification Foundation.
  # License: 3-clause BSD.  The full license text is available at:
  #  - https://github.com/uqfoundation/dill/blob/master/LICENSE

  # The following function is also based on 'save_codeobject' from 'cloudpickle'
  # Copyright (c) 2012, Regents of the University of California.
  # Copyright (c) 2009 `PiCloud, Inc. <http://www.picloud.com>`_.
  # License: 3-clause BSD.  The full license text is available at:
  #  - https://github.com/cloudpipe/cloudpickle/blob/master/LICENSE

  from types import CodeType

  @dill.register(CodeType)
  def save_code(pickler, obj):
    """Reduces a code object to a ``CodeType(*args)`` reconstruction call.

    The positional arguments passed to ``CodeType`` depend on which
    attributes the code object exposes; the newest layout is probed first.

    Args:
      pickler: The pickler whose ``save_reduce`` records the reduction.
      obj: The code object being pickled.
    """
    # Attribute names up to and including ``co_name``, in constructor order,
    # as used by every layout that has ``co_posonlyargcount``.
    leading = (
        'co_argcount',
        'co_posonlyargcount',
        'co_kwonlyargcount',
        'co_nlocals',
        'co_stacksize',
        'co_flags',
        'co_code',
        'co_consts',
        'co_names',
        'co_varnames',
        'co_filename',
        'co_name')
    # Every layout ends with these two attributes.
    trailing = ('co_freevars', 'co_cellvars')

    if hasattr(obj, "co_endlinetable"):  # python 3.11a (20 args)
      middle = (
          'co_qualname',
          'co_firstlineno',
          'co_linetable',
          'co_endlinetable',
          'co_columntable',
          'co_exceptiontable')
    elif hasattr(obj, "co_exceptiontable"):  # python 3.11 (18 args)
      middle = (
          'co_qualname',
          'co_firstlineno',
          'co_linetable',
          'co_exceptiontable')
    elif hasattr(obj, "co_linetable"):  # python 3.10 (16 args)
      middle = ('co_firstlineno', 'co_linetable')
    elif hasattr(obj, "co_posonlyargcount"):  # python 3.8 (16 args)
      middle = ('co_firstlineno', 'co_lnotab')
    else:  # python 3.7 (15 args)
      # This layout has no positional-only argument count.
      leading = tuple(
          field for field in leading if field != 'co_posonlyargcount')
      middle = ('co_firstlineno', 'co_lnotab')

    # Attributes are read left to right, in constructor order.
    args = tuple(
        getattr(obj, field) for field in leading + middle + trailing)
    pickler.save_reduce(CodeType, args, obj=obj)

  dill._dill.save_code = save_code


class _NoOpContextManager:
  """Context manager whose enter and exit steps do nothing."""
  def __enter__(self):
    return None

  def __exit__(self, *unused_exc_info):
    # Returning None (falsy) lets any in-flight exception propagate.
    return None


# Pickling, especially unpickling, causes broken module imports on Python 3
# if executed concurrently, see: BEAM-8651, http://bugs.python.org/issue38884.
_pickle_lock = threading.RLock()
# Dill 0.2.8 renamed dill.dill to dill._dill:
# https://github.com/uqfoundation/dill/commit/f0972ecc7a41d0b8acada6042d557068cac69baa
# TODO: Remove this once Beam depends on dill >= 0.2.8
if not getattr(dill, 'dill', None):
  dill.dill = dill._dill
  sys.modules['dill.dill'] = dill._dill

# TODO: Remove once Dataflow has containers with a preinstalled dill >= 0.2.8
if not getattr(dill, '_dill', None):
  dill._dill = dill.dill
  sys.modules['dill._dill'] = dill.dill

# dill v0.3.6 changed the attribute name from 'log' to 'logger', so fall back
# to 'logger' when 'log' is missing or falsy.
dill_log = getattr(dill.dill, 'log', None) or dill.dill.logger


def _is_nested_class(cls):
  """Returns true if argument is a class object that appears to be nested.

  A class counts as nested when its name is not bound at the top level of
  the module named by its ``__module__`` attribute.
  """
  if not isinstance(cls, type):
    return False
  module_name = cls.__module__
  if module_name is None or module_name == 'builtins':
    return False
  return cls.__name__ not in sys.modules[module_name].__dict__


def _find_containing_class(nested_class):
  """Finds containing class of a nested class passed as argument.

  Performs a depth-first walk starting at the module of ``nested_class``,
  descending into every class found in a ``__dict__``.

  Returns:
    An ``(owner, attribute_name)`` tuple such that
    ``owner.__dict__[attribute_name] is nested_class``, or None if no owner
    was found.
  """

  visited = set()

  def _search(scope):
    # Guard against cycles (e.g. a class referencing itself as an attribute).
    if scope in visited:
      return None
    visited.add(scope)
    for attr_name, attr_value in scope.__dict__.items():
      if attr_value is nested_class:
        return scope, attr_name
      if isinstance(attr_value, type) and hasattr(attr_value, '__dict__'):
        found = _search(attr_value)
        if found:
          return found
    return None

  return _search(sys.modules[nested_class.__module__])


def _dict_from_mappingproxy(mp):
  """Copies a class ``__dict__`` and drops entries not passed to type creation.

  Args:
    mp: The mapping to copy; it must provide ``copy()``.

  Returns:
    The copy, without '__dict__', '__prepare__' and '__weakref__'.
  """
  class_dict = mp.copy()
  for dropped_key in ('__dict__', '__prepare__', '__weakref__'):
    class_dict.pop(dropped_key, None)
  return class_dict


def _nested_type_wrapper(fun):
  """A wrapper for the standard pickler handler for class objects.

  Args:
    fun: Original pickler handler for type objects.

  Returns:
    A wrapper for type objects that handles nested classes.

  The wrapper detects if an object being pickled is a nested class object.
  For nested class object only it will save the containing class object so
  the nested structure is recreated during unpickle.
  """
  def wrapper(pickler, obj):
    # When the nested class is defined in the __main__ module we do not have to
    # do anything special because the pickler itself will save the constituent
    # parts of the type (i.e., name, base classes, dictionary) and then
    # recreate it during unpickling.
    if _is_nested_class(obj) and obj.__module__ != '__main__':
      owner_and_name = _find_containing_class(obj)
      if owner_and_name is not None:
        # Unpickling becomes getattr(owner, name).
        return pickler.save_reduce(getattr, owner_and_name, obj=obj)

    try:
      return fun(pickler, obj)
    except dill.dill.PicklingError:
      # The wrapped handler could not pickle the class; rebuild it from its
      # parts instead.
      # pylint: disable=protected-access
      create_type = dill.dill._create_type
      # pylint: enable=protected-access
      type_parts = (
          type(obj),
          obj.__name__,
          obj.__bases__,
          _dict_from_mappingproxy(obj.__dict__))
      return pickler.save_reduce(create_type, type_parts, obj=obj)

  return wrapper


# Monkey patch the standard pickler dispatch table entry for type objects.
# Dill, for certain types, defers to the standard pickler (including type
# objects). We wrap the standard handler using _nested_type_wrapper() because
# for nested class we want to pickle the actual enclosing class object so we
# can recreate it during unpickling.
# TODO(silviuc): Make sure we submit the fix upstream to GitHub dill project.
dill.dill.Pickler.dispatch[type] = _nested_type_wrapper(
    dill.dill.Pickler.dispatch[type])


# Dill pickles generators objects without complaint, but unpickling produces
# TypeError: object.__new__(generator) is not safe, use generator.__new__()
# on some versions of Python.
def _reject_generators(unused_pickler, unused_obj):
  """Pickler dispatch handler that refuses to pickle generator objects.

  Raises:
    TypeError: Always.
  """
  raise TypeError("can't (safely) pickle generator objects")


dill.dill.Pickler.dispatch[types.GeneratorType] = _reject_generators

# This if guards against dill not being fully initialized when generating docs.
if 'save_module' in dir(dill.dill):

  # Always pickle non-main modules by name.
  old_save_module = dill.dill.save_module

  @dill.dill.register(dill.dill.ModuleType)
  def save_module(pickler, obj):
    """Pickles the pickler's main module via dill; any other module by name."""
    if dill.dill.is_dill(pickler) and obj is pickler._main:
      return old_save_module(pickler, obj)

    dill_log.info('M2: %s' % obj)
    # pylint: disable=protected-access
    import_module = dill.dill._import_module
    # pylint: enable=protected-access
    pickler.save_reduce(import_module, (obj.__name__, ), obj=obj)
    dill_log.info('# M2')

  # Pickle module dictionaries (commonly found in lambda's globals)
  # by referencing their module.
  old_save_module_dict = dill.dill.save_module_dict
  known_module_dicts = {
  }  # type: Dict[int, Tuple[types.ModuleType, Dict[str, Any]]]

  @dill.dill.register(dict)
  def new_save_module_dict(pickler, obj):
    """Pickles a module's ``__dict__`` as ``getattr(module, '__dict__')``.

    Dictionaries that are not a known module dictionary, or that are handled
    by a non-dill pickler, are delegated to the original dill handler.
    """
    obj_id = id(obj)
    looks_like_module_dict = (
        not known_module_dicts or '__file__' in obj or '__package__' in obj)
    if looks_like_module_dict and obj_id not in known_module_dicts:
      # Trigger loading of lazily loaded modules (such as pytest vendored
      # modules).
      # This pass over sys.modules needs to iterate on a copy of sys.modules
      # since lazy loading modifies the dictionary, hence the use of list().
      for module in list(sys.modules.values()):
        try:
          _ = module.__dict__
        except AttributeError:
          pass

      # Index each module dictionary by id(); the dictionary is stored next
      # to its module in the value tuple.
      for module in list(sys.modules.values()):
        try:
          if not module or module.__name__ == '__main__':
            continue
          if not isinstance(module, dill.dill.ModuleType):
            continue
          module_dict = module.__dict__
        except AttributeError:
          # Skip modules that do not have the __name__ attribute.
          continue
        known_module_dicts[id(module_dict)] = module, module_dict

    if obj_id not in known_module_dicts or not dill.dill.is_dill(pickler):
      return old_save_module_dict(pickler, obj)

    owner = known_module_dicts[obj_id][0]
    try:
      # Verify the module can be imported by name before pickling a
      # reference to it.
      # pylint: disable=protected-access
      dill.dill._import_module(owner.__name__)
      # pylint: enable=protected-access
      return pickler.save_reduce(getattr, (owner, '__dict__'), obj=obj)
    except (ImportError, AttributeError):
      return old_save_module_dict(pickler, obj)

  dill.dill.save_module_dict = new_save_module_dict

  def _nest_dill_logging():
    """Prefix all dill logging with its depth in the callstack.

    Useful for debugging pickling of deeply nested structures.
    """
    old_log_info = dill_log.info

    def new_log_info(msg, *args, **kwargs):
      # One prefix character per stack frame, capped by the length of the
      # repeated digit ruler.
      depth = len(traceback.extract_stack())
      prefix = ('1 2 3 4 5 6 7 8 9 0 ' * 10)[:depth]
      old_log_info(prefix + msg, *args, **kwargs)

    dill_log.info = new_log_info


# Turn off verbose logging from the dill pickler.
logging.getLogger('dill').setLevel(logging.WARN)


def dumps(o, enable_trace=True, use_zlib=False):
  # type: (...) -> bytes

  """For internal use only; no backwards-compatibility guarantees.

  Pickles ``o`` with dill, compresses the result, and base64-encodes it.

  Args:
    o: The object to pickle.
    enable_trace: If true, a failed pickling attempt is retried once with
      dill tracing switched on; otherwise the failure is re-raised directly.
    use_zlib: Compress with zlib instead of bz2.

  Returns:
    The base64-encoded, compressed pickle.
  """
  with _pickle_lock:
    try:
      pickled = dill.dumps(o, byref=settings['dill_byref'])
    except Exception:  # pylint: disable=broad-except
      if not enable_trace:
        raise
      dill.dill._trace(True)  # pylint: disable=protected-access
      pickled = dill.dumps(o, byref=settings['dill_byref'])
    finally:
      dill.dill._trace(False)  # pylint: disable=protected-access

  # Compress as compactly as possible (compresslevel=9) to decrease peak memory
  # usage (of multiple in-memory copies) and to avoid hitting protocol buffer
  # limits.
  # WARNING: Be cautious about compressor change since it can lead to pipeline
  # representation change, and can break streaming job update compatibility on
  # runners such as Dataflow.
  compressed = (
      zlib.compress(pickled, 9)
      if use_zlib else bz2.compress(pickled, compresslevel=9))
  del pickled  # Free up some possibly large and no-longer-needed memory.

  return base64.b64encode(compressed)


def loads(encoded, enable_trace=True, use_zlib=False):
  """For internal use only; no backwards-compatibility guarantees.

  Reverses ``dumps``: base64-decodes, decompresses, and unpickles.

  Args:
    encoded: The base64-encoded, compressed pickle.
    enable_trace: If true, a failed unpickling attempt is retried once with
      dill tracing switched on; otherwise the failure is re-raised directly.
    use_zlib: Decompress with zlib instead of bz2.

  Returns:
    The unpickled object.
  """

  compressed = base64.b64decode(encoded)
  pickled = (
      zlib.decompress(compressed) if use_zlib else bz2.decompress(compressed))
  del compressed  # Free up some possibly large and no-longer-needed memory.

  with _pickle_lock:
    try:
      return dill.loads(pickled)
    except Exception:  # pylint: disable=broad-except
      if not enable_trace:
        raise
      dill.dill._trace(True)  # pylint: disable=protected-access
      return dill.loads(pickled)
    finally:
      dill.dill._trace(False)  # pylint: disable=protected-access


def dump_session(file_path):
  """For internal use only; no backwards-compatibility guarantees.

  Pickle the current python session to be used in the worker.

  Note: Due to the inconsistency in the first dump of dill dump_session we
  create and load the dump twice to have consistent results in the worker and
  the running session. Check: https://github.com/uqfoundation/dill/issues/195
  """
  with _pickle_lock:
    # Dump, reload, then dump again; the second dump is the one returned.
    dill.dump_session(file_path)
    dill.load_session(file_path)
    result = dill.dump_session(file_path)
  return result


def load_session(file_path):
  """Loads a session from ``file_path`` while holding the pickle lock."""
  with _pickle_lock:
    result = dill.load_session(file_path)
  return result


def override_pickler_hooks(extend=True):
  """ Extends the dill library hooks into that of the standard pickler library.

  If false all hooks that dill overrides will be removed.
  If true dill hooks will be injected into the pickler library dispatch_table.
  """
  dill.extend(extend)