github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/runner.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""PipelineRunner, an abstract base runner object."""

# pytype: skip-file

import importlib
import logging
import os
import shelve
import shutil
import tempfile
from typing import TYPE_CHECKING
from typing import Optional

from apache_beam.options.pipeline_options import StandardOptions

if TYPE_CHECKING:
  from apache_beam import pvalue
  from apache_beam import PTransform
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.pipeline import AppliedPTransform
  from apache_beam.pipeline import Pipeline
  from apache_beam.pipeline import PipelineVisitor

__all__ = ['PipelineRunner', 'PipelineState', 'PipelineResult']

_RUNNER_MAP = {
    path.rsplit('.', maxsplit=1)[-1].lower(): path
    for path in StandardOptions.ALL_KNOWN_RUNNERS
}

# Allow this alias, but don't make public.
_RUNNER_MAP['pythonrpcdirectrunner'] = (
    'apache_beam.runners.experimental'
    '.python_rpc_direct.python_rpc_direct_runner.PythonRPCDirectRunner')

_LOGGER = logging.getLogger(__name__)


def create_runner(runner_name):
  # type: (str) -> PipelineRunner

  """For internal use only; no backwards-compatibility guarantees.

  Creates a runner instance from a runner class name.

  Args:
    runner_name: Name of the pipeline runner. Possible values are listed in
      _RUNNER_MAP above.

  Returns:
    A runner object.

  Raises:
    ValueError: if an invalid runner name is used.
  """

  # Get the qualified runner name by using the lower case runner name. If that
  # fails, try appending 'runner' to the name and check if that matches.
  # If that also fails, use the given runner name as is.
  runner_name = _RUNNER_MAP.get(
      runner_name.lower(),
      _RUNNER_MAP.get(runner_name.lower() + 'runner', runner_name))

  if '.' in runner_name:
    module, runner = runner_name.rsplit('.', 1)
    try:
      return getattr(importlib.import_module(module), runner)()
    except ImportError:
      if 'dataflow' in runner_name.lower():
        raise ImportError(
            'Google Cloud Dataflow runner not available, '
            'please install apache_beam[gcp]')
      elif 'interactive' in runner_name.lower():
        raise ImportError(
            'Interactive runner not available, '
            'please install apache_beam[interactive]')
      else:
        raise
  else:
    raise ValueError(
        'Unexpected pipeline runner: %s. Valid values are %s '
        'or the fully qualified name of a PipelineRunner subclass.' %
        (runner_name, ', '.join(StandardOptions.KNOWN_RUNNER_NAMES)))
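
# Illustrative usage of create_runner (a sketch, not part of this module;
# the fully qualified path below assumes Beam's standard DirectRunner
# location):
#
#   runner = create_runner('DirectRunner')  # canonical class name
#   runner = create_runner('direct')        # lower-cased, 'runner' appended
#   runner = create_runner(
#       'apache_beam.runners.direct.direct_runner.DirectRunner')  # qualified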


class PipelineRunner(object):
  """A runner of a pipeline object.

  The base runner provides a run() method for visiting every node in the
  pipeline's DAG and executing the transforms computing the PValue in the
  node.

  A custom runner will typically provide implementations for some of the
  transform methods (ParDo, GroupByKey, Create, etc.). It may also
  provide a new implementation for clear_pvalue(), which is used to wipe out
  materialized values in order to reduce footprint.
  """

  def run(self,
          transform,  # type: PTransform
          options=None  # type: Optional[PipelineOptions]
         ):
    # type: (...) -> PipelineResult

    """Run the given transform or callable with this runner.

    Blocks until the pipeline is complete. See also `PipelineRunner.run_async`.
    """
    result = self.run_async(transform, options)
    result.wait_until_finish()
    return result

  def run_async(self,
                transform,  # type: PTransform
                options=None  # type: Optional[PipelineOptions]
               ):
    # type: (...) -> PipelineResult

    """Run the given transform or callable with this runner.

    May return immediately, executing the pipeline in the background.
    The returned result object can be queried for progress, and
    `wait_until_finish` may be called to block until completion.
    """
    # Imported here to avoid circular dependencies.
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam import PTransform
    from apache_beam.pvalue import PBegin
    from apache_beam.pipeline import Pipeline
    p = Pipeline(runner=self, options=options)
    if isinstance(transform, PTransform):
      p | transform
    else:
      transform(PBegin(p))
    return p.run()

  def run_pipeline(
      self,
      pipeline,  # type: Pipeline
      options  # type: PipelineOptions
  ):
    # type: (...) -> PipelineResult

    """Execute the entire pipeline or the sub-DAG reachable from a node.

    Runners should override this method.
    """
    raise NotImplementedError

  def apply(self,
            transform,  # type: PTransform
            input,  # type: Optional[pvalue.PValue]
            options  # type: PipelineOptions
           ):
    """Runner callback for a pipeline.apply call.

    Args:
      transform: the transform to apply.
      input: transform's input (typically a PCollection).
      options: the pipeline options.

    A concrete implementation of the Runner class may want to do custom
    pipeline construction for a given transform. To override the behavior
    for a transform class Xyz, implement an apply_Xyz method with this same
    signature.
    """
    for cls in transform.__class__.mro():
      m = getattr(self, 'apply_%s' % cls.__name__, None)
      if m:
        return m(transform, input, options)
    raise NotImplementedError(
        'Execution of [%s] not implemented in runner %s.' % (transform, self))
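
  # Sketch of the apply_Xyz hook described above (hypothetical subclass; not
  # part of this module). Because apply() walks the transform class's MRO,
  # this method would be picked for GroupByKey and any of its subclasses:
  #
  #   class MyRunner(PipelineRunner):
  #     def apply_GroupByKey(self, transform, input, options):
  #       # Custom construction-time handling for GroupByKey; everything
  #       # else falls through to apply_PTransform below.
  #       return transform.expand(input)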

  def visit_transforms(
      self,
      pipeline,  # type: Pipeline
      options  # type: PipelineOptions
  ):
    # type: (...) -> None
    # Imported here to avoid circular dependencies.
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam.pipeline import PipelineVisitor

    class RunVisitor(PipelineVisitor):
      def __init__(self, runner):
        # type: (PipelineRunner) -> None
        self.runner = runner

      def visit_transform(self, transform_node):
        try:
          self.runner.run_transform(transform_node, options)
        except:
          _LOGGER.error('Error while visiting %s', transform_node.full_label)
          raise

    pipeline.visit(RunVisitor(self))

  def apply_PTransform(self, transform, input, options):
    # The base case of apply is to call the transform's expand.
    return transform.expand(input)

  def run_transform(self,
                    transform_node,  # type: AppliedPTransform
                    options  # type: PipelineOptions
                   ):
    """Runner callback for a pipeline.run call.

    Args:
      transform_node: transform node for the transform to run.

    A concrete implementation of the Runner class must implement run_Abc for
    some class Abc in the method resolution order of every non-composite
    transform in the pipeline.
    """
    for cls in transform_node.transform.__class__.mro():
      m = getattr(self, 'run_%s' % cls.__name__, None)
      if m:
        return m(transform_node, options)
    raise NotImplementedError(
        'Execution of [%s] not implemented in runner %s.' %
        (transform_node.transform, self))

  def is_fnapi_compatible(self):
    """Whether to enable the beam_fn_api experiment by default."""
    return True
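

# Sketch of the run_Abc dispatch used by run_transform above (hypothetical
# names; a concrete runner would register one such method per primitive
# transform it knows how to execute):
#
#   class MyRunner(PipelineRunner):
#     def run_ParDo(self, transform_node, options):
#       ...  # execute the ParDo and cache its outputs
#
# run_transform walks the transform class's MRO, so a run_PTransform method
# can serve as a catch-all for otherwise unhandled primitive transforms.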


class PValueCache(object):
  """For internal use only; no backwards-compatibility guarantees.

  Local cache for arbitrary information computed for PValue objects."""
  def __init__(self, use_disk_backed_cache=False):
    # Cache of values computed while a runner executes a pipeline. This is a
    # dictionary of PValues and their computed values. Note that in principle
    # the runner could contain PValues from several pipelines without clashes
    # since a PValue is associated with one and only one pipeline. The keys of
    # the dictionary are (producing transform's full label, output tag)
    # tuples, as built by to_cache_key() below.

    self._use_disk_backed_cache = use_disk_backed_cache
    if use_disk_backed_cache:
      self._tempdir = tempfile.mkdtemp()
      self._cache = shelve.open(os.path.join(self._tempdir, 'shelve'))
    else:
      self._cache = {}

  def __del__(self):
    if self._use_disk_backed_cache:
      self._cache.close()
      shutil.rmtree(self._tempdir)

  def __len__(self):
    return len(self._cache)

  def to_cache_key(self, transform, tag):
    return transform.full_label, tag

  def _ensure_pvalue_has_real_producer(self, pvalue):
    """Ensure the passed-in PValue has the real_producer attribute.

    Args:
      pvalue: A PValue instance whose cached value is requested.

    During the runner's execution only the results of the primitive transforms
    are cached. Whenever we are looking for a PValue that is the output of a
    composite transform we need to find the output of its rightmost transform
    part.
    """
    if not hasattr(pvalue, 'real_producer'):
      real_producer = pvalue.producer
      while real_producer.parts:
        real_producer = real_producer.parts[-1]
      pvalue.real_producer = real_producer

  def is_cached(self, pobj):
    from apache_beam.pipeline import AppliedPTransform
    if isinstance(pobj, AppliedPTransform):
      transform = pobj
      tag = None
    else:
      self._ensure_pvalue_has_real_producer(pobj)
      transform = pobj.real_producer
      tag = pobj.tag
    return self.to_cache_key(transform, tag) in self._cache

  def cache_output(self, transform, tag_or_value, value=None):
    if value is None:
      value = tag_or_value
      tag = None
    else:
      tag = tag_or_value
    self._cache[self.to_cache_key(transform, tag)] = value

  def get_pvalue(self, pvalue):
    """Gets the value associated with a PValue from the cache."""
    self._ensure_pvalue_has_real_producer(pvalue)
    try:
      return self._cache[self.key(pvalue)]
    except KeyError:
      if (pvalue.tag is not None and
          self.to_cache_key(pvalue.real_producer, None) in self._cache):
        # This is an undeclared, empty output of a DoFn executed
        # in the local runner before this output was referenced.
        return []
      else:
        raise

  def get_unwindowed_pvalue(self, pvalue):
    return [v.value for v in self.get_pvalue(pvalue)]

  def clear_pvalue(self, pvalue):
    """Removes a PValue from the cache."""
    if self.is_cached(pvalue):
      del self._cache[self.key(pvalue)]

  def key(self, pobj):
    self._ensure_pvalue_has_real_producer(pobj)
    return self.to_cache_key(pobj.real_producer, pobj.tag)


# FIXME: replace with PipelineState(str, enum.Enum)
class PipelineState(object):
  """State of the Pipeline, as returned by :attr:`PipelineResult.state`.

  This is meant to be the union of all the states any runner can put a
  pipeline in. Currently, it represents the values of the dataflow
  API JobState enum.
  """
  UNKNOWN = 'UNKNOWN'  # not specified by a runner, or unknown to a runner.
  STARTING = 'STARTING'  # not yet started
  STOPPED = 'STOPPED'  # paused or not yet started
  RUNNING = 'RUNNING'  # currently running
  DONE = 'DONE'  # successfully completed (terminal state)
  FAILED = 'FAILED'  # failed (terminal state)
  CANCELLED = 'CANCELLED'  # explicitly cancelled (terminal state)
  UPDATED = 'UPDATED'  # replaced by another job (terminal state)
  DRAINING = 'DRAINING'  # still processing, no longer reading data
  DRAINED = 'DRAINED'  # draining completed (terminal state)
  PENDING = 'PENDING'  # the job has been created but is not yet running.
  CANCELLING = 'CANCELLING'  # job has been explicitly cancelled and is
  # in the process of stopping
  RESOURCE_CLEANING_UP = 'RESOURCE_CLEANING_UP'  # job's resources are being
  # cleaned up
  UNRECOGNIZED = 'UNRECOGNIZED'  # the job state reported by a runner cannot be
  # interpreted by the SDK.

  @classmethod
  def is_terminal(cls, state):
    return state in [
        cls.DONE, cls.FAILED, cls.CANCELLED, cls.UPDATED, cls.DRAINED
    ]
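

# Illustrative check using PipelineState (a sketch; `result` stands in for a
# PipelineResult obtained from some runner):
#
#   if PipelineState.is_terminal(result.state):
#       ...  # the job can no longer change state; safe to read final results
#
# Note that DRAINING, PENDING, CANCELLING and RESOURCE_CLEANING_UP are all
# non-terminal: a job in one of those states may still transition.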


class PipelineResult(object):
  """A :class:`PipelineResult` provides access to info about a pipeline."""
  def __init__(self, state):
    self._state = state

  @property
  def state(self):
    """Return the current state of the pipeline execution."""
    return self._state

  def wait_until_finish(self, duration=None):
    """Waits until the pipeline finishes and returns the final status.

    Args:
      duration (int): The time to wait (in milliseconds) for the job to
        finish. If it is set to :data:`None`, it will wait indefinitely until
        the job is finished.

    Raises:
      IOError: If there is a persistent problem getting job
        information.
      NotImplementedError: If the runner does not support this
        operation.

    Returns:
      The final state of the pipeline, or :data:`None` on timeout.
    """
    raise NotImplementedError

  def cancel(self):
    """Cancels the pipeline execution.

    Raises:
      IOError: If there is a persistent problem getting job
        information.
      NotImplementedError: If the runner does not support this
        operation.

    Returns:
      The final state of the pipeline.
    """
    raise NotImplementedError

  def metrics(self):
    """Returns a :class:`~apache_beam.metrics.metric.MetricResults` object to
    query metrics from the runner.

    Raises:
      NotImplementedError: If the runner does not support this
        operation.
    """
    raise NotImplementedError

  # pylint: disable=unused-argument
  def aggregated_values(self, aggregator_or_name):
    """Return a dict of step names to values of the Aggregator."""
    _LOGGER.warning(
        '%s does not implement aggregated_values', self.__class__.__name__)
    return {}
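

# Typical blocking usage of a PipelineResult (a sketch; `pipeline` is a
# placeholder for an apache_beam.Pipeline constructed elsewhere):
#
#   result = pipeline.run()
#   state = result.wait_until_finish(duration=60 * 1000)  # wait up to 60s
#   if state is None:
#       ...  # timed out; the job may still be running
#   elif state == PipelineState.FAILED:
#       ...  # inspect result.metrics() or the runner's logs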