#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Module to build and run background source recording jobs.

For internal use only; no backwards-compatibility guarantees.

A background source recording job is a job that records events for all
recordable sources of a given pipeline. With Interactive Beam, one such job is
started when a pipeline run happens (which produces a main job in contrast to
the background source recording job) and meets the following conditions:

  #. The pipeline contains recordable sources, configured through
     interactive_beam.options.recordable_sources.
  #. No such background job is running.
  #. No such background job has completed successfully and the cached events are
     still valid (invalidated when recordable sources change in the pipeline).

Once started, the background source recording job runs asynchronously until it
hits some recording limit configured in interactive_beam.options. Meanwhile,
the main job and future main jobs from the pipeline will run using the
deterministic replayable recorded events until they are invalidated.
"""

# pytype: skip-file

import logging
import threading
import time

import apache_beam as beam
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive import utils
from apache_beam.runners.interactive.caching import streaming_cache
from apache_beam.runners.runner import PipelineState

_LOGGER = logging.getLogger(__name__)


class BackgroundCachingJob(object):
  """A simple abstraction that controls necessary components of a timed and
  space limited background source recording job.

  A background source recording job successfully completes source data
  recording in 2 conditions:

    #. The job is finite and runs into DONE state;
    #. The job is infinite but hits an interactive_beam.options configured limit
       and gets cancelled into CANCELLED/CANCELLING state.

  In both situations, the background source recording job should be treated as
  done successfully.
  """
  def __init__(self, pipeline_result, limiters):
    """Wraps a pipeline result and starts a daemon thread that monitors it.

    Args:
      pipeline_result: the result object of the background job; this class
        reads its ``state`` attribute and calls its ``cancel()`` method.
      limiters: an iterable of objects exposing ``is_triggered()``.
    """
    self._pipeline_result = pipeline_result
    # Re-entrant because the checker thread calls cancel(), which takes the
    # same lock again.
    self._result_lock = threading.RLock()
    self._condition_checker = threading.Thread(
        target=self._background_caching_job_condition_checker, daemon=True)

    # Limiters are checks s.t. if any are triggered then the background caching
    # job gets cancelled.
    self._limiters = limiters
    # Start only after all attributes read by the checker have been assigned.
    self._condition_checker.start()

  def _background_caching_job_condition_checker(self):
    """Polls every 0.5 seconds until the job is terminal or a limiter fires.

    When a limiter is triggered, the job is cancelled and the loop exits.
    """
    while True:
      with self._result_lock:
        if PipelineState.is_terminal(self._pipeline_result.state):
          break

      if self._should_end_condition_checker():
        self.cancel()
        break
      time.sleep(0.5)

  def _should_end_condition_checker(self):
    """Returns True if any of the configured limiters is triggered."""
    return any(limiter.is_triggered() for limiter in self._limiters)

  def is_done(self):
    """Returns True if the job should be treated as successfully done.

    That is the case when the state is DONE or CANCELLED, or when a limiter
    has been triggered and the state is CANCELLING.
    """
    with self._result_lock:
      is_terminated = self._pipeline_result.state in (
          PipelineState.DONE, PipelineState.CANCELLED)
      is_triggered = self._should_end_condition_checker()
      # Compare by value, consistently with the membership test above; an
      # identity check would depend on the state being the very same object.
      is_cancelling = self._pipeline_result.state == PipelineState.CANCELLING
    return is_terminated or (is_triggered and is_cancelling)

  def is_running(self):
    """Returns True if the underlying pipeline result is in RUNNING state."""
    with self._result_lock:
      # Value comparison rather than identity; see is_done().
      return self._pipeline_result.state == PipelineState.RUNNING

  def cancel(self):
    """Cancels this background source recording job.

    NOOP if the job is already in a terminal state or if the pipeline result
    does not implement cancellation.
    """
    with self._result_lock:
      if not PipelineState.is_terminal(self._pipeline_result.state):
        try:
          self._pipeline_result.cancel()
        except NotImplementedError:
          # Ignore the cancel invocation if it is never implemented by the
          # runner.
          pass

  @property
  def state(self):
    """The current state of the underlying pipeline result."""
    with self._result_lock:
      return self._pipeline_result.state


def attempt_to_run_background_caching_job(
    runner, user_pipeline, options=None, limiters=None):
  """Attempts to run a background source recording job for a user-defined
  pipeline.

  Returns True if a job was started, False otherwise.

  The pipeline result is automatically tracked by Interactive Beam in case
  future cancellation/cleanup is needed.

  Args:
    runner: the runner passed to ``Pipeline.from_runner_api`` when building
      the derived pipeline and the background caching pipeline.
    user_pipeline: the user-defined pipeline to record sources for.
    options: pipeline options passed to ``Pipeline.from_runner_api``.
    limiters: limiters for the started job; when falsy, the limiters from
      the current environment's capture control options are used instead.
  """
  if is_background_caching_job_needed(user_pipeline):
    # Cancel non-terminal jobs if there is any before starting a new one.
    attempt_to_cancel_background_caching_job(user_pipeline)
    # Cancel the gRPC server serving the test stream if there is one.
    attempt_to_stop_test_stream_service(user_pipeline)
    # TODO(BEAM-8335): refactor background source recording job logic from
    # pipeline_instrument module to this module and aggregate tests.
    from apache_beam.runners.interactive import pipeline_instrument as instr
    runner_pipeline = beam.pipeline.Pipeline.from_runner_api(
        user_pipeline.to_runner_api(), runner, options)
    ie.current_env().add_derived_pipeline(user_pipeline, runner_pipeline)
    background_caching_job_result = beam.pipeline.Pipeline.from_runner_api(
        instr.build_pipeline_instrument(
            runner_pipeline).background_caching_pipeline_proto(),
        runner,
        options).run()

    recording_limiters = (
        limiters
        if limiters else ie.current_env().options.capture_control.limiters())
    ie.current_env().set_background_caching_job(
        user_pipeline,
        BackgroundCachingJob(
            background_caching_job_result, limiters=recording_limiters))
    return True
  return False


def is_background_caching_job_needed(user_pipeline):
  """Determines if a background source recording job needs to be started.

  It does several state checks and recording state changes throughout the
  process. It is not idempotent to simplify the usage.
  """
  job = ie.current_env().get_background_caching_job(user_pipeline)
  # Checks if the pipeline contains any source that needs to be cached.
  need_cache = has_source_to_cache(user_pipeline)
  # If this is True, we can invalidate a previous done/running job if there is
  # one.
  cache_changed = is_source_to_cache_changed(user_pipeline)
  # When recording replay is disabled, cache is always needed for recordable
  # sources (if any).
  if need_cache and not ie.current_env().options.enable_recording_replay:
    from apache_beam.runners.interactive.options import capture_control
    capture_control.evict_captured_data()
    return True
  return (
      need_cache and
      (
          # There is no previous job tracked for the pipeline.
          not job or
          # Or the previous job is neither done (completed successfully, so
          # the cached events might still be valid) nor still running.
          not (job.is_done() or job.is_running()) or
          # Or the sources changed, so the previous job can be invalidated.
          cache_changed))


def is_cache_complete(pipeline_id):
  # type: (str) -> bool

  """Returns True if the background cache for the given pipeline is done.
  """
  user_pipeline = ie.current_env().pipeline_id_to_pipeline(pipeline_id)
  job = ie.current_env().get_background_caching_job(user_pipeline)
  is_done = job and job.is_done()
  cache_changed = is_source_to_cache_changed(
      user_pipeline, update_cached_source_signature=False)

  # Stop reading from the cache if the background job is done or the underlying
  # cache signature changed that requires a new background source recording job.
  return is_done or cache_changed


def has_source_to_cache(user_pipeline):
  """Determines if a user-defined pipeline contains any source that need to be
  cached. If so, also immediately wrap current cache manager held by current
  interactive environment into a streaming cache if this has not been done.
  The wrapping doesn't invalidate existing cache in any way.

  This can help determining if a background source recording job is needed to
  write cache for sources and if a test stream service is needed to serve the
  cache.

  Throughout the check, if source-to-cache has changed from the last check, it
  also cleans up the invalidated cache early on.

  Raises:
    ValueError: if a streaming cache has to be created and the configured
      cache root is a ``gs://`` path.
  """
  # TODO(BEAM-8335): we temporarily only cache replaceable unbounded sources.
  # Add logic for other cacheable sources here when they are available.
  has_cache = utils.has_unbounded_sources(user_pipeline)
  if has_cache:
    if not isinstance(ie.current_env().get_cache_manager(user_pipeline,
                                                         create_if_absent=True),
                      streaming_cache.StreamingCache):

      file_based_cm = ie.current_env().get_cache_manager(user_pipeline)
      cache_dir = file_based_cm._cache_dir
      cache_root = ie.current_env().options.cache_root
      # A configured cache root takes precedence over the directory of the
      # existing cache manager.
      if cache_root:
        if cache_root.startswith('gs://'):
          raise ValueError(
              'GCS cache paths are not currently supported for '
              'streaming pipelines.')
        cache_dir = cache_root
      ie.current_env().set_cache_manager(
          streaming_cache.StreamingCache(
              cache_dir,
              is_cache_complete=is_cache_complete,
              sample_resolution_sec=1.0,
              saved_pcoders=file_based_cm._saved_pcoders),
          user_pipeline)
  return has_cache


def attempt_to_cancel_background_caching_job(user_pipeline):
  """Attempts to cancel background source recording job for a user-defined
  pipeline.

  If no background source recording job needs to be cancelled, NOOP. Otherwise,
  cancel such job.
  """
  job = ie.current_env().get_background_caching_job(user_pipeline)
  if job:
    job.cancel()


def attempt_to_stop_test_stream_service(user_pipeline):
  """Attempts to stop the gRPC server/service serving the test stream.

  If there is no such server started, NOOP. Otherwise, stop it.
  """
  if is_a_test_stream_service_running(user_pipeline):
    ie.current_env().evict_test_stream_service_controller(user_pipeline).stop()


def is_a_test_stream_service_running(user_pipeline):
  """Checks to see if there is a gRPC server/service running that serves the
  test stream to any job started from the given user_pipeline.
  """
  return ie.current_env().get_test_stream_service_controller(
      user_pipeline) is not None


def _sizeof_fmt(num, suffix='B'):
  """Formats a size with decimal (power of 1000) unit prefixes."""
  for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
    if abs(num) < 1000.0:
      return "%3.1f%s%s" % (num, unit, suffix)
    num /= 1000.0
  # The value is scaled by 1000 per step, so the matching prefix is the
  # decimal 'Y' rather than the binary 'Yi'.
  return "%.1f%s%s" % (num, 'Y', suffix)


def is_source_to_cache_changed(
    user_pipeline, update_cached_source_signature=True):
  """Determines if there is any change in the sources that need to be cached
  used by the user-defined pipeline.

  Due to the expensiveness of computations and for the simplicity of usage, this
  function is not idempotent because Interactive Beam automatically discards
  previously tracked signature of transforms and tracks the current signature of
  transforms for the user-defined pipeline if there is any change.

  When it's True, there is addition/deletion/mutation of source transforms that
  requires a new background source recording job.

  Args:
    user_pipeline: the user-defined pipeline to inspect.
    update_cached_source_signature: when True and a change is detected, the
      environment is cleaned up for the pipeline and the current signature is
      recorded; when False, the check has none of these effects.
  """
  # By default gets empty set if the user_pipeline is first time seen because
  # we can treat it as adding transforms.
  recorded_signature = ie.current_env().get_cached_source_signature(
      user_pipeline)
  current_signature = extract_source_to_cache_signature(user_pipeline)
  is_changed = not current_signature.issubset(recorded_signature)
  # The computation of extract_unbounded_source_signature is expensive, track on
  # change by default.
  if is_changed and update_cached_source_signature:
    options = ie.current_env().options
    # No info needed when recording replay is disabled.
    if options.enable_recording_replay:
      if not recorded_signature:
        _LOGGER.info(
            'Interactive Beam has detected unbounded sources in your pipeline. '
            'In order to have a deterministic replay, a segment of data will '
            'be recorded from all sources for %s seconds or until a total of '
            '%s have been written to disk.',
            options.recording_duration.total_seconds(),
            _sizeof_fmt(options.recording_size_limit))
      else:
        _LOGGER.info(
            'Interactive Beam has detected a new streaming source was '
            'added to the pipeline. In order for the cached streaming '
            'data to start at the same time, all recorded data has been '
            'cleared and a new segment of data will be recorded.')

    ie.current_env().cleanup(user_pipeline)
    ie.current_env().set_cached_source_signature(
        user_pipeline, current_signature)
    ie.current_env().add_user_pipeline(user_pipeline)
  return is_changed


def extract_source_to_cache_signature(user_pipeline):
  """Extracts a set of signature for sources that need to be cached in the
  user-defined pipeline.

  A signature is a str representation of urn and payload of a source.
  """
  # TODO(BEAM-8335): we temporarily only cache replaceable unbounded sources.
  # Add logic for other cacheable sources here when they are available.
  unbounded_sources_as_applied_transforms = utils.unbounded_sources(
      user_pipeline)
  unbounded_sources_as_ptransforms = set(
      map(lambda x: x.transform, unbounded_sources_as_applied_transforms))
  _, context = user_pipeline.to_runner_api(return_context=True)
  signature = set(
      map(
          lambda transform: str(transform.to_runner_api(context)),
          unbounded_sources_as_ptransforms))
  return signature