github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/utils/multi_process_shared.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Implements a shared object that spans processes.
    19  
    20  This object will be instantiated once per VM and methods will be invoked
    21  on it via rpc.
    22  """
    23  # pytype: skip-file
    24  
    25  import logging
    26  import multiprocessing.managers
    27  import os
    28  import tempfile
    29  import threading
    30  from typing import Any
    31  from typing import Callable
    32  from typing import Dict
    33  from typing import Generic
    34  from typing import Optional
    35  from typing import TypeVar
    36  
    37  import fasteners
    38  
# Type of the object produced by the constructor handed to MultiProcessShared.
T = TypeVar('T')
# Key passed as `authkey` to every manager created in this module and also
# installed as the authkey of the current process.
AUTH_KEY = b'mps'
    41  
    42  
    43  class _SingletonProxy:
    44    """Proxies the shared object so we can release it with better errors and no
    45    risk of dangling references in the multiprocessing manager infrastructure.
    46    """
    47    def __init__(self, entry):
    48      # Guard names so as to not conflict with names of underlying object.
    49      self._SingletonProxy_entry = entry
    50      self._SingletonProxy_valid = True
    51  
    52    # Used to make the shared object callable (see _AutoProxyWrapper below)
    53    def singletonProxy_call__(self, *args, **kwargs):
    54      if not self._SingletonProxy_valid:
    55        raise RuntimeError('Entry was released.')
    56      return self._SingletonProxy_entry.obj.__call__(*args, **kwargs)
    57  
    58    def _SingletonProxy_release(self):
    59      assert self._SingletonProxy_valid
    60      self._SingletonProxy_valid = False
    61  
    62    def __getattr__(self, name):
    63      if not self._SingletonProxy_valid:
    64        raise RuntimeError('Entry was released.')
    65      return getattr(self._SingletonProxy_entry.obj, name)
    66  
    67    def __dir__(self):
    68      # Needed for multiprocessing.managers's proxying.
    69      dir = self._SingletonProxy_entry.obj.__dir__()
    70      dir.append('singletonProxy_call__')
    71      return dir
    72  
    73  
    74  class _SingletonEntry:
    75    """Represents a single, refcounted entry in this process."""
    76    def __init__(self, constructor, initialize_eagerly=True):
    77      self.constructor = constructor
    78      self.refcount = 0
    79      self.lock = threading.Lock()
    80      if initialize_eagerly:
    81        self.obj = constructor()
    82        self.initialied = True
    83      else:
    84        self.initialied = False
    85  
    86    def acquire(self):
    87      with self.lock:
    88        if not self.initialied:
    89          self.obj = self.constructor()
    90          self.initialied = True
    91        self.refcount += 1
    92        return _SingletonProxy(self)
    93  
    94    def release(self, proxy):
    95      proxy._SingletonProxy_release()
    96      with self.lock:
    97        self.refcount -= 1
    98        if self.refcount == 0:
    99          del self.obj
   100          self.initialied = False
   101  
   102  
   103  class _SingletonManager:
   104    entries: Dict[Any, Any] = {}
   105  
   106    def register_singleton(self, constructor, tag, initialize_eagerly=True):
   107      assert tag not in self.entries, tag
   108      self.entries[tag] = _SingletonEntry(constructor, initialize_eagerly)
   109  
   110    def has_singleton(self, tag):
   111      return tag in self.entries
   112  
   113    def acquire_singleton(self, tag):
   114      return self.entries[tag].acquire()
   115  
   116    def release_singleton(self, tag, obj):
   117      return self.entries[tag].release(obj)
   118  
   119  
# The singletons registered in this process. Its acquire/release methods are
# also the callables registered on _SingletonRegistrar.
_process_level_singleton_manager = _SingletonManager()

# Taken by MultiProcessShared._get_manager, outside the inter-process lock, so
# that threads of this process go through manager setup one at a time.
_process_local_lock = threading.Lock()
   123  
   124  
   125  class _SingletonRegistrar(multiprocessing.managers.BaseManager):
   126    pass
   127  
   128  
# Register the process-level manager's bound methods as typeids on the
# registrar, so a connected _SingletonRegistrar offers `acquire_singleton` and
# `release_singleton` methods backed by these callables.
_SingletonRegistrar.register(
    'acquire_singleton',
    callable=_process_level_singleton_manager.acquire_singleton)
_SingletonRegistrar.register(
    'release_singleton',
    callable=_process_level_singleton_manager.release_singleton)
   135  
   136  
   137  # By default, objects registered with BaseManager.register will have only
   138  # public methods available (excluding __call__). If you know the functions
   139  # you would like to expose, you can do so at register time with the `exposed`
   140  # attribute. Since we don't, we will add a wrapper around the returned AutoProxy
   141  # object to handle __call__ function calls and turn them into
   142  # singletonProxy_call__ calls (which is a wrapper around the underlying
   143  # object's __call__ function)
   144  class _AutoProxyWrapper:
   145    def __init__(self, proxyObject: multiprocessing.managers.BaseProxy):
   146      self._proxyObject = proxyObject
   147  
   148    def __call__(self, *args, **kwargs):
   149      return self._proxyObject.singletonProxy_call__(*args, **kwargs)
   150  
   151    def __getattr__(self, name):
   152      return getattr(self._proxyObject, name)
   153  
   154  
class MultiProcessShared(Generic[T]):
  """MultiProcessShared is used to share a single object across processes.

  For example, one could have the class::

    class MyExpensiveObject(object):
      def __init__(self, args):
        [expensive initialization and memory allocation]

      def method(self, arg):
        ...

  One could share a single instance of this class by wrapping it as::

    shared_ptr = MultiProcessShared(
        lambda: MyExpensiveObject(...), tag='my_expensive_object')
    my_expensive_object = shared_ptr.acquire()

  which could then be invoked as::

    my_expensive_object.method(arg)

  This can then be released with::

    shared_ptr.release(my_expensive_object)

  but care should be taken to avoid releasing the object too soon or
  expensive re-initialization may be required, defeating the point of
  using a shared object.


  Args:
    constructor: function that initialises / constructs the object if not
      present in the cache. This function should take no arguments. It should
      return an initialised object, or raise an exception if the object could
      not be initialised / constructed.
    tag: an identifier to store with the cached object. If multiple
      MultiProcessShared instances are created with the same tag, they will all
      share the same proxied object. It is joined onto ``path`` to name the
      lock and address files.
    path: a temporary path in which to create the inter-process lock
    always_proxy: whether to direct all calls through the proxy, rather than
      call the object directly for the process that created it
  """
  def __init__(
      self,
      constructor: Callable[[], T],
      tag: Any,
      *,
      path: str = tempfile.gettempdir(),
      always_proxy: Optional[bool] = None):
    self._constructor = constructor
    self._tag = tag
    self._path = path
    # An unspecified always_proxy is treated as False.
    self._always_proxy = False if always_proxy is None else always_proxy
    self._proxy = None
    # Set by _get_manager on first use.
    self._manager = None
    self._rpc_address = None
    # Lock file lives next to the address file: <path>/<tag>.lock
    self._cross_process_lock = fasteners.InterProcessLock(
        os.path.join(self._path, self._tag) + '.lock')

  def _get_manager(self):
    """Returns the manager used to acquire/release the singleton, caching it.

    While holding both the process-local and the inter-process lock, this
    creates the server if no address file exists, then picks either the
    process-level manager (this process has the singleton registered and
    always_proxy is false) or a _SingletonRegistrar connected to the address
    read from the address file. If connecting fails, the address file is
    removed and the loop tries again.
    """
    if self._manager is None:
      address_file = os.path.join(self._path, self._tag) + ".address"
      while self._manager is None:
        with _process_local_lock:
          with self._cross_process_lock:
            # No address file: no server is advertised, so start one here.
            if not os.path.exists(address_file):
              self._create_server(address_file)

            if _process_level_singleton_manager.has_singleton(
                self._tag) and not self._always_proxy:
              # The singleton is registered in this process; use it directly.
              self._manager = _process_level_singleton_manager
            else:
              with open(address_file) as fin:
                address = fin.read()
              logging.info('Connecting to remote proxy at %s', address)
              # The file holds 'host:port' as written by _create_server.
              host, port = address.split(':')
              # We need to be able to authenticate with both the manager and
              # the process.
              manager = _SingletonRegistrar(
                  address=(host, int(port)), authkey=AUTH_KEY)
              multiprocessing.current_process().authkey = AUTH_KEY
              try:
                manager.connect()
                self._manager = manager
              except ConnectionError:
                # The server is no longer good, assume it died.
                # self._manager stays None, so the while loop retries.
                os.unlink(address_file)

    return self._manager

  def acquire(self):
    """Acquires the singleton for this tag, wrapped in an _AutoProxyWrapper."""
    # TODO: Allow passing/parameterizing the callable here, in case they are
    # not available at MultiProcessShared construction time (e.g. from side
    # inputs)
    # Caveat: They must always agree, as they will be ignored if the object
    # is already constructed.
    singleton = self._get_manager().acquire_singleton(self._tag)
    return _AutoProxyWrapper(singleton)

  def release(self, obj):
    """Releases ``obj`` through the manager cached on this instance.

    Uses ``self._manager`` directly rather than ``_get_manager()``, so a
    manager must already have been obtained by this instance.
    """
    self._manager.release_singleton(self._tag, obj)

  def _create_server(self, address_file):
    """Registers the singleton in this process and starts serving it.

    Args:
      address_file: path that will hold the server's 'host:port' address.
    """
    # We need to be able to authenticate with both the manager and the process.
    # Port 0 is requested here; the actual address is read back from the
    # server below.
    self._serving_manager = _SingletonRegistrar(
        address=('localhost', 0), authkey=AUTH_KEY)
    multiprocessing.current_process().authkey = AUTH_KEY
    # Initialize eagerly to avoid acting as the server if there are issues.
    # Note, however, that _create_server itself is called lazily.
    _process_level_singleton_manager.register_singleton(
        self._constructor, self._tag, initialize_eagerly=True)
    self._server = self._serving_manager.get_server()
    logging.info(
        'Starting proxy server at %s for shared %s',
        self._server.address,
        self._tag)
    # Write to a temporary name and rename, so the address file only appears
    # under its final name once fully written.
    with open(address_file + '.tmp', 'w') as fout:
      fout.write('%s:%d' % self._server.address)
    os.rename(address_file + '.tmp', address_file)
    # Serve from a daemon thread.
    t = threading.Thread(target=self._server.serve_forever, daemon=True)
    t.start()
    logging.info('Done starting server')