github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/internal/dill_pickler.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Pickler for values, functions, and classes.
    19  
    20  For internal use only. No backwards compatibility guarantees.
    21  
Pickles created by the pickling library contain non-ASCII characters, so
we base64-encode the results so that we can put them in JSON objects.
    24  The pickler is used to embed FlatMap callable objects into the workflow JSON
    25  description.
    26  
    27  The pickler module should be used to pickle functions and modules; for values,
    28  the coders.*PickleCoder classes should be used instead.
    29  """
    30  
    31  # pytype: skip-file
    32  
    33  import base64
    34  import bz2
    35  import logging
    36  import sys
    37  import threading
    38  import traceback
    39  import types
    40  import zlib
    41  from typing import Any
    42  from typing import Dict
    43  from typing import Tuple
    44  
    45  import dill
    46  
    47  settings = {'dill_byref': None}
    48  
    49  if sys.version_info >= (3, 11) and dill.__version__ == "0.3.1.1":
    50    # Let's make dill 0.3.1.1 support Python 3.11.
    51  
    52    # The following function is based on 'save_code' from 'dill'
    53    # Author: Mike McKerns (mmckerns @caltech and @uqfoundation)
    54    # Copyright (c) 2008-2015 California Institute of Technology.
    55    # Copyright (c) 2016-2023 The Uncertainty Quantification Foundation.
    56    # License: 3-clause BSD.  The full license text is available at:
    57    #  - https://github.com/uqfoundation/dill/blob/master/LICENSE
    58  
    59    # The following function is also based on 'save_codeobject' from 'cloudpickle'
    60    # Copyright (c) 2012, Regents of the University of California.
    61    # Copyright (c) 2009 `PiCloud, Inc. <http://www.picloud.com>`_.
    62    # License: 3-clause BSD.  The full license text is available at:
    63    #  - https://github.com/cloudpipe/cloudpickle/blob/master/LICENSE
    64  
    65    from types import CodeType
    66  
    67    @dill.register(CodeType)
    68    def save_code(pickler, obj):
    69      if hasattr(obj, "co_endlinetable"):  # python 3.11a (20 args)
    70        args = (
    71            obj.co_argcount,
    72            obj.co_posonlyargcount,
    73            obj.co_kwonlyargcount,
    74            obj.co_nlocals,
    75            obj.co_stacksize,
    76            obj.co_flags,
    77            obj.co_code,
    78            obj.co_consts,
    79            obj.co_names,
    80            obj.co_varnames,
    81            obj.co_filename,
    82            obj.co_name,
    83            obj.co_qualname,
    84            obj.co_firstlineno,
    85            obj.co_linetable,
    86            obj.co_endlinetable,
    87            obj.co_columntable,
    88            obj.co_exceptiontable,
    89            obj.co_freevars,
    90            obj.co_cellvars)
    91      elif hasattr(obj, "co_exceptiontable"):  # python 3.11 (18 args)
    92        args = (
    93            obj.co_argcount,
    94            obj.co_posonlyargcount,
    95            obj.co_kwonlyargcount,
    96            obj.co_nlocals,
    97            obj.co_stacksize,
    98            obj.co_flags,
    99            obj.co_code,
   100            obj.co_consts,
   101            obj.co_names,
   102            obj.co_varnames,
   103            obj.co_filename,
   104            obj.co_name,
   105            obj.co_qualname,
   106            obj.co_firstlineno,
   107            obj.co_linetable,
   108            obj.co_exceptiontable,
   109            obj.co_freevars,
   110            obj.co_cellvars)
   111      elif hasattr(obj, "co_linetable"):  # python 3.10 (16 args)
   112        args = (
   113            obj.co_argcount,
   114            obj.co_posonlyargcount,
   115            obj.co_kwonlyargcount,
   116            obj.co_nlocals,
   117            obj.co_stacksize,
   118            obj.co_flags,
   119            obj.co_code,
   120            obj.co_consts,
   121            obj.co_names,
   122            obj.co_varnames,
   123            obj.co_filename,
   124            obj.co_name,
   125            obj.co_firstlineno,
   126            obj.co_linetable,
   127            obj.co_freevars,
   128            obj.co_cellvars)
   129      elif hasattr(obj, "co_posonlyargcount"):  # python 3.8 (16 args)
   130        args = (
   131            obj.co_argcount,
   132            obj.co_posonlyargcount,
   133            obj.co_kwonlyargcount,
   134            obj.co_nlocals,
   135            obj.co_stacksize,
   136            obj.co_flags,
   137            obj.co_code,
   138            obj.co_consts,
   139            obj.co_names,
   140            obj.co_varnames,
   141            obj.co_filename,
   142            obj.co_name,
   143            obj.co_firstlineno,
   144            obj.co_lnotab,
   145            obj.co_freevars,
   146            obj.co_cellvars)
   147      else:  # python 3.7 (15 args)
   148        args = (
   149            obj.co_argcount,
   150            obj.co_kwonlyargcount,
   151            obj.co_nlocals,
   152            obj.co_stacksize,
   153            obj.co_flags,
   154            obj.co_code,
   155            obj.co_consts,
   156            obj.co_names,
   157            obj.co_varnames,
   158            obj.co_filename,
   159            obj.co_name,
   160            obj.co_firstlineno,
   161            obj.co_lnotab,
   162            obj.co_freevars,
   163            obj.co_cellvars)
   164      pickler.save_reduce(CodeType, args, obj=obj)
   165  
   166    dill._dill.save_code = save_code
   167  
   168  
   169  class _NoOpContextManager(object):
   170    def __enter__(self):
   171      pass
   172  
   173    def __exit__(self, *unused_exc_info):
   174      pass
   175  
   176  
   177  # Pickling, especially unpickling, causes broken module imports on Python 3
   178  # if executed concurrently, see: BEAM-8651, http://bugs.python.org/issue38884.
   179  _pickle_lock = threading.RLock()
   180  # Dill 0.28.0 renamed dill.dill to dill._dill:
   181  # https://github.com/uqfoundation/dill/commit/f0972ecc7a41d0b8acada6042d557068cac69baa
   182  # TODO: Remove this once Beam depends on dill >= 0.2.8
   183  if not getattr(dill, 'dill', None):
   184    dill.dill = dill._dill
   185    sys.modules['dill.dill'] = dill._dill
   186  
   187  # TODO: Remove once Dataflow has containers with a preinstalled dill >= 0.2.8
   188  if not getattr(dill, '_dill', None):
   189    dill._dill = dill.dill
   190    sys.modules['dill._dill'] = dill.dill
   191  
   192  dill_log = getattr(dill.dill, 'log', None)
   193  
   194  # dill v0.3.6 changed the attribute name from 'log' to 'logger'
   195  if not dill_log:
   196    dill_log = getattr(dill.dill, 'logger')
   197  
   198  
   199  def _is_nested_class(cls):
   200    """Returns true if argument is a class object that appears to be nested."""
   201    return (
   202        isinstance(cls, type) and cls.__module__ is not None and
   203        cls.__module__ != 'builtins' and
   204        cls.__name__ not in sys.modules[cls.__module__].__dict__)
   205  
   206  
   207  def _find_containing_class(nested_class):
   208    """Finds containing class of a nested class passed as argument."""
   209  
   210    seen = set()
   211  
   212    def _find_containing_class_inner(outer):
   213      if outer in seen:
   214        return None
   215      seen.add(outer)
   216      for k, v in outer.__dict__.items():
   217        if v is nested_class:
   218          return outer, k
   219        elif isinstance(v, type) and hasattr(v, '__dict__'):
   220          res = _find_containing_class_inner(v)
   221          if res: return res
   222  
   223    return _find_containing_class_inner(sys.modules[nested_class.__module__])
   224  
   225  
   226  def _dict_from_mappingproxy(mp):
   227    d = mp.copy()
   228    d.pop('__dict__', None)
   229    d.pop('__prepare__', None)
   230    d.pop('__weakref__', None)
   231    return d
   232  
   233  
   234  def _nested_type_wrapper(fun):
   235    """A wrapper for the standard pickler handler for class objects.
   236  
   237    Args:
   238      fun: Original pickler handler for type objects.
   239  
   240    Returns:
   241      A wrapper for type objects that handles nested classes.
   242  
   243    The wrapper detects if an object being pickled is a nested class object.
   244    For nested class object only it will save the containing class object so
   245    the nested structure is recreated during unpickle.
   246    """
   247    def wrapper(pickler, obj):
   248      # When the nested class is defined in the __main__ module we do not have to
   249      # do anything special because the pickler itself will save the constituent
   250      # parts of the type (i.e., name, base classes, dictionary) and then
   251      # recreate it during unpickling.
   252      if _is_nested_class(obj) and obj.__module__ != '__main__':
   253        containing_class_and_name = _find_containing_class(obj)
   254        if containing_class_and_name is not None:
   255          return pickler.save_reduce(getattr, containing_class_and_name, obj=obj)
   256      try:
   257        return fun(pickler, obj)
   258      except dill.dill.PicklingError:
   259        # pylint: disable=protected-access
   260        return pickler.save_reduce(
   261            dill.dill._create_type,
   262            (
   263                type(obj),
   264                obj.__name__,
   265                obj.__bases__,
   266                _dict_from_mappingproxy(obj.__dict__)),
   267            obj=obj)
   268        # pylint: enable=protected-access
   269  
   270    return wrapper
   271  
   272  
# Monkey patch the standard pickler dispatch table entry for type objects.
# Dill, for certain types, defers to the standard pickler (including type
# objects). We wrap the standard handler using _nested_type_wrapper() because
# for nested classes we want to pickle the actual enclosing class object so we
# can recreate it during unpickling.
# TODO(silviuc): Make sure we submit the fix upstream to GitHub dill project.
dill.dill.Pickler.dispatch[type] = _nested_type_wrapper(
    dill.dill.Pickler.dispatch[type])
   281  
   282  
   283  # Dill pickles generators objects without complaint, but unpickling produces
   284  # TypeError: object.__new__(generator) is not safe, use generator.__new__()
   285  # on some versions of Python.
   286  def _reject_generators(unused_pickler, unused_obj):
   287    raise TypeError("can't (safely) pickle generator objects")
   288  
   289  
# Route generator objects to the rejecting handler so that pickling them fails
# with a TypeError.
dill.dill.Pickler.dispatch[types.GeneratorType] = _reject_generators
   291  
# This if guards against dill not being fully initialized when generating docs.
if 'save_module' in dir(dill.dill):

  # Always pickle non-main modules by name.
  # Keep dill's own handler so it can still be used for the main module.
  old_save_module = dill.dill.save_module

  @dill.dill.register(dill.dill.ModuleType)
  def save_module(pickler, obj):
    """Pickles a module, by name unless it is the pickler's main module.

    Args:
      pickler: The pickler saving obj.
      obj: The module object being pickled.

    Returns:
      The result of dill's original handler when pickler is a dill pickler
      and obj is its _main module; otherwise None, after saving obj as a call
      to dill's _import_module with the module name.
    """
    if dill.dill.is_dill(pickler) and obj is pickler._main:
      return old_save_module(pickler, obj)
    else:
      dill_log.info('M2: %s' % obj)
      # pylint: disable=protected-access
      pickler.save_reduce(dill.dill._import_module, (obj.__name__, ), obj=obj)
      # pylint: enable=protected-access
      dill_log.info('# M2')

  # Pickle module dictionaries (commonly found in lambda's globals)
  # by referencing their module.
  old_save_module_dict = dill.dill.save_module_dict
  # Cache keyed by id(module.__dict__); each value is (module, module.__dict__).
  known_module_dicts = {
  }  # type: Dict[int, Tuple[types.ModuleType, Dict[str, Any]]]

  @dill.dill.register(dict)
  def new_save_module_dict(pickler, obj):
    """Pickles a dict, by reference when it is a known module's __dict__.

    Args:
      pickler: The pickler saving obj.
      obj: The dict being pickled.

    Returns:
      The result of pickler.save_reduce when obj is saved as
      getattr(module, '__dict__'), otherwise the result of dill's original
      dict handler.
    """
    obj_id = id(obj)
    # (Re)scan sys.modules when the cache is empty, or when obj looks like a
    # module namespace ('__file__' or '__package__' key) that is not cached yet.
    if not known_module_dicts or '__file__' in obj or '__package__' in obj:
      if obj_id not in known_module_dicts:
        # Trigger loading of lazily loaded modules (such as pytest vendored
        # modules).
        # This pass over sys.modules needs to iterate on a copy of sys.modules
        # since lazy loading modifies the dictionary, hence the use of list().
        for m in list(sys.modules.values()):
          try:
            _ = m.__dict__
          except AttributeError:
            pass

        # Record the namespace of every real module except __main__.
        for m in list(sys.modules.values()):
          try:
            if (m and m.__name__ != '__main__' and
                isinstance(m, dill.dill.ModuleType)):
              d = m.__dict__
              known_module_dicts[id(d)] = m, d
          except AttributeError:
            # Skip modules that do not have the __name__ attribute.
            pass
    if obj_id in known_module_dicts and dill.dill.is_dill(pickler):
      m = known_module_dicts[obj_id][0]
      try:
        # Check that the module can be imported by name before saving a
        # reference to it; otherwise fall back to the original handler.
        # pylint: disable=protected-access
        dill.dill._import_module(m.__name__)
        return pickler.save_reduce(
            getattr, (known_module_dicts[obj_id][0], '__dict__'), obj=obj)
      except (ImportError, AttributeError):
        return old_save_module_dict(pickler, obj)
    else:
      return old_save_module_dict(pickler, obj)

  # Also replace the module attribute, in addition to the dict registration
  # done by the decorator above.
  dill.dill.save_module_dict = new_save_module_dict

  def _nest_dill_logging():
    """Prefix all dill logging with its depth in the callstack.

    Useful for debugging pickling of deeply nested structures.

    Replaces dill_log.info with a wrapper; nothing in this block calls this
    function, so the replacement only happens when it is invoked explicitly.
    """
    old_log_info = dill_log.info

    def new_log_info(msg, *args, **kwargs):
      # The prefix is the first N characters of a 200-character digit ruler,
      # where N is the current number of stack frames.
      old_log_info(
          ('1 2 3 4 5 6 7 8 9 0 ' * 10)[:len(traceback.extract_stack())] + msg,
          *args,
          **kwargs)

    dill_log.info = new_log_info
   365  
   366      dill_log.info = new_log_info
   367  
   368  
   369  # Turn off verbose logging from the dill pickler.
   370  logging.getLogger('dill').setLevel(logging.WARN)
   371  
   372  
   373  def dumps(o, enable_trace=True, use_zlib=False):
   374    # type: (...) -> bytes
   375  
   376    """For internal use only; no backwards-compatibility guarantees."""
   377    with _pickle_lock:
   378      try:
   379        s = dill.dumps(o, byref=settings['dill_byref'])
   380      except Exception:  # pylint: disable=broad-except
   381        if enable_trace:
   382          dill.dill._trace(True)  # pylint: disable=protected-access
   383          s = dill.dumps(o, byref=settings['dill_byref'])
   384        else:
   385          raise
   386      finally:
   387        dill.dill._trace(False)  # pylint: disable=protected-access
   388  
   389    # Compress as compactly as possible (compresslevel=9) to decrease peak memory
   390    # usage (of multiple in-memory copies) and to avoid hitting protocol buffer
   391    # limits.
   392    # WARNING: Be cautious about compressor change since it can lead to pipeline
   393    # representation change, and can break streaming job update compatibility on
   394    # runners such as Dataflow.
   395    if use_zlib:
   396      c = zlib.compress(s, 9)
   397    else:
   398      c = bz2.compress(s, compresslevel=9)
   399    del s  # Free up some possibly large and no-longer-needed memory.
   400  
   401    return base64.b64encode(c)
   402  
   403  
   404  def loads(encoded, enable_trace=True, use_zlib=False):
   405    """For internal use only; no backwards-compatibility guarantees."""
   406  
   407    c = base64.b64decode(encoded)
   408  
   409    if use_zlib:
   410      s = zlib.decompress(c)
   411    else:
   412      s = bz2.decompress(c)
   413  
   414    del c  # Free up some possibly large and no-longer-needed memory.
   415  
   416    with _pickle_lock:
   417      try:
   418        return dill.loads(s)
   419      except Exception:  # pylint: disable=broad-except
   420        if enable_trace:
   421          dill.dill._trace(True)  # pylint: disable=protected-access
   422          return dill.loads(s)
   423        else:
   424          raise
   425      finally:
   426        dill.dill._trace(False)  # pylint: disable=protected-access
   427  
   428  
def dump_session(file_path):
  """For internal use only; no backwards-compatibility guarantees.

  Pickle the current python session to be used in the worker.

  Note: Due to the inconsistency in the first dump of dill dump_session we
  dump the session, load it back, and dump it again to have consistent results
  in the worker and the running session.
  Check: https://github.com/uqfoundation/dill/issues/195

  Args:
    file_path: Path passed to dill.dump_session and dill.load_session.

  Returns:
    The result of the final dill.dump_session call.
  """
  with _pickle_lock:
    # Dump, reload, then dump again; the last dump is the one that is kept.
    dill.dump_session(file_path)
    dill.load_session(file_path)
    return dill.dump_session(file_path)
   442  
   443  
def load_session(file_path):
  """Loads a dill session dump while holding the module's pickle lock.

  Args:
    file_path: Path passed to dill.load_session.

  Returns:
    The result of dill.load_session.
  """
  with _pickle_lock:
    return dill.load_session(file_path)
   447  
   448  
def override_pickler_hooks(extend=True):
  """Extends the dill library hooks into those of the standard pickler library.

  Thin wrapper around dill.extend().

  Args:
    extend: If true, dill hooks will be injected into the pickler library
      dispatch_table. If false, all hooks that dill overrides will be removed.
  """
  dill.extend(extend)