github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/benchmarks/nexmark/nexmark_launcher.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Nexmark launcher.

The Nexmark suite is a series of queries (streaming pipelines) performed
on a simulation of auction events. The launcher orchestrates the generation
and parsing of streaming events and the running of queries.

Model
  - Person: Author of an auction or a bid.
  - Auction: Item under auction.
  - Bid: A bid for an item under auction.

Events
  - Create Person
  - Create Auction
  - Create Bid

Queries
  - Query0: Pass through (send and receive auction events).

Usage
  - DirectRunner
      python nexmark_launcher.py \
          --query/q <query number> \
          --project <project id> \
          --loglevel=DEBUG (optional) \
          --wait_until_finish_duration <time_in_ms> \
          --streaming

  - DataflowRunner
      python nexmark_launcher.py \
          --query/q <query number> \
          --project <project id> \
          --region <GCE region> \
          --loglevel=DEBUG (optional) \
          --wait_until_finish_duration <time_in_ms> \
          --streaming \
          --sdk_location <apache_beam tar.gz> \
          --staging_location=gs://... \
          --temp_location=gs://...
"""

# pytype: skip-file
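# Batch mode is also supported. A minimal sketch of an invocation, assuming a
# hypothetical local events file (any path readable by ReadFromText works;
# the --input flag is required whenever --streaming is not set):
#
#   python nexmark_launcher.py -q 0 \
#       --input /tmp/nexmark_events.json \
#       --num_events 100000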
import argparse
import json
import logging
import os
import time
import uuid

import requests
from requests.auth import HTTPBasicAuth

import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import TypeOptions
from apache_beam.runners import PipelineState
from apache_beam.testing.benchmarks.nexmark import nexmark_util
from apache_beam.testing.benchmarks.nexmark.monitor import Monitor
from apache_beam.testing.benchmarks.nexmark.monitor import MonitorSuffix
from apache_beam.testing.benchmarks.nexmark.nexmark_perf import NexmarkPerf
from apache_beam.testing.benchmarks.nexmark.queries import query0
from apache_beam.testing.benchmarks.nexmark.queries import query1
from apache_beam.testing.benchmarks.nexmark.queries import query2
from apache_beam.testing.benchmarks.nexmark.queries import query3
from apache_beam.testing.benchmarks.nexmark.queries import query4
from apache_beam.testing.benchmarks.nexmark.queries import query5
from apache_beam.testing.benchmarks.nexmark.queries import query6
from apache_beam.testing.benchmarks.nexmark.queries import query7
from apache_beam.testing.benchmarks.nexmark.queries import query8
from apache_beam.testing.benchmarks.nexmark.queries import query9
from apache_beam.testing.benchmarks.nexmark.queries import query10
from apache_beam.testing.benchmarks.nexmark.queries import query11
from apache_beam.testing.benchmarks.nexmark.queries import query12
from apache_beam.transforms import window


class NexmarkLauncher(object):

  # How long (in seconds) after some result has been seen, with no further
  # activity observed, before we cancel the job.
  DONE_DELAY = 5 * 60
  # Delay in seconds between perf data samples.
  PERF_DELAY = 20
  # Delay in seconds before cancelling the job when the pipeline appears to
  # be stuck.
  TERMINATE_DELAY = 1 * 60 * 60
  # Delay in seconds before warning that the pipeline appears to be stuck.
  WARNING_DELAY = 10 * 60

  def __init__(self):
    self.parse_args()
    self.manage_resources = self.args.manage_resources
    self.uuid = str(uuid.uuid4()) if self.manage_resources else ''
    self.topic_name = (
        self.args.topic_name + self.uuid if self.args.topic_name else None)
    self.subscription_name = (
        self.args.subscription_name +
        self.uuid if self.args.subscription_name else None)
    self.pubsub_mode = self.args.pubsub_mode
    if self.manage_resources:
      from google.cloud import pubsub
      self.cleanup()
      publish_client = pubsub.Client(project=self.project)
      topic = publish_client.topic(self.topic_name)
      logging.info('creating topic %s', self.topic_name)
      topic.create()
      sub = topic.subscription(self.subscription_name)
      logging.info('creating subscription %s', self.subscription_name)
      sub.create()

    self.export_influxdb = self.args.export_summary_to_influx_db
    if self.export_influxdb:
      self.influx_database = self.args.influx_database
      self.influx_host = self.args.influx_host
      self.influx_base = self.args.base_influx_measurement
      self.influx_retention = self.args.influx_retention_policy
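  # Note on resource naming: when --manage_resources is set, a fresh UUID is
  # appended to the topic and subscription names above, so concurrent
  # benchmark runs do not collide on Pub/Sub resources. cleanup() removes any
  # pre-existing pair before creation and is invoked again on exit.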
  def parse_args(self):
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--query',
        '-q',
        type=int,
        action='append',
        required=True,
        choices=[i for i in range(13)],
        help='Query to run')

    parser.add_argument(
        '--subscription_name',
        type=str,
        help='Pub/Sub subscription to read from')

    parser.add_argument(
        '--topic_name', type=str, help='Pub/Sub topic to read from')

    parser.add_argument(
        '--loglevel',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
        default='INFO',
        help='Set the logging level')
    parser.add_argument(
        '--input',
        type=str,
        help='Path to the data file containing nexmark events.')
    parser.add_argument(
        '--num_events',
        type=int,
        default=100000,
        help='Number of events expected to be processed.')
    parser.add_argument(
        '--manage_resources',
        default=False,
        action='store_true',
        help='If set, manage the creation and cleanup of topics and '
        'subscriptions.')
    parser.add_argument(
        '--pubsub_mode',
        type=str,
        default='SUBSCRIBE_ONLY',
        choices=['PUBLISH_ONLY', 'SUBSCRIBE_ONLY', 'COMBINED'],
        help='Pub/Sub mode used in the pipeline.')

    parser.add_argument(
        '--export_summary_to_influx_db',
        default=False,
        action='store_true',
        help='If set, store results in InfluxDB.')
    parser.add_argument(
        '--influx_database',
        type=str,
        default='beam_test_metrics',
        help='InfluxDB database name')
    parser.add_argument(
        '--influx_host',
        type=str,
        default='http://localhost:8086',
        help='InfluxDB database URL')
    parser.add_argument(
        '--base_influx_measurement',
        type=str,
        default='nexmark',
        help='Prefix for the InfluxDB measurement name')
    parser.add_argument(
        '--influx_retention_policy',
        type=str,
        default='forever',
        help='Retention policy for stored results')

    self.args, self.pipeline_args = parser.parse_known_args()
    logging.basicConfig(
        level=getattr(logging, self.args.loglevel, None),
        format='(%(threadName)-10s) %(message)s')

    self.pipeline_options = PipelineOptions(self.pipeline_args)
    logging.debug('args, pipeline_args: %s, %s', self.args, self.pipeline_args)

    # Usage with Dataflow requires a project to be supplied.
    self.project = self.pipeline_options.view_as(GoogleCloudOptions).project
    self.streaming = self.pipeline_options.view_as(StandardOptions).streaming
    self.pipeline_options.view_as(TypeOptions).allow_unsafe_triggers = True

    if self.streaming:
      if self.args.subscription_name is None or self.project is None:
        raise ValueError(
            'arguments --subscription_name and --project '
            'are required when running in streaming mode')
    else:
      if self.args.input is None:
        raise ValueError(
            'argument --input is required when running in batch mode')

    # We use the save_main_session option because one or more DoFns in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    self.pipeline_options.view_as(SetupOptions).save_main_session = True
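  # Example (illustrative): because --query uses action='append', passing
  # "--query 1 -q 2" on the command line yields args.query == [1, 2], and
  # run() below executes each listed query in turn.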
  def generate_events(self):
    from google.cloud import pubsub
    publish_client = pubsub.Client(project=self.project)
    topic = publish_client.topic(self.topic_name)

    logging.info('Generating auction events to topic %s', topic.name)

    if self.args.input.startswith('gs://'):
      from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
      fs = GCSFileSystem(self.pipeline_options)
      with fs.open(self.args.input) as infile:
        for line in infile:
          topic.publish(line)
    else:
      with open(self.args.input) as infile:
        for line in infile:
          topic.publish(line)

    logging.info('Finished event generation.')

  def read_from_file(self):
    return (
        self.pipeline
        | 'reading_from_file' >> beam.io.ReadFromText(self.args.input)
        | 'deserialization' >> beam.ParDo(nexmark_util.ParseJsonEventFn())
        | 'timestamping' >>
        beam.Map(lambda e: window.TimestampedValue(e, e.date_time)))

  def read_from_pubsub(self):
    # Read from Pub/Sub into a PCollection.
    if self.subscription_name:
      raw_events = self.pipeline | 'ReadPubSub_sub' >> beam.io.ReadFromPubSub(
          subscription=self.subscription_name,
          with_attributes=True,
          timestamp_attribute='timestamp')
    else:
      raw_events = self.pipeline | 'ReadPubSub_topic' >> beam.io.ReadFromPubSub(
          topic=self.topic_name,
          with_attributes=True,
          timestamp_attribute='timestamp')
    events = (
        raw_events
        | 'pubsub_unwrap' >> beam.Map(lambda m: m.data)
        | 'deserialization' >> beam.ParDo(nexmark_util.ParseJsonEventFn()))
    return events

  def run_query(
      self, query_num, query, query_args, pipeline_options, query_errors):
    try:
      self.pipeline = beam.Pipeline(options=self.pipeline_options)
      nexmark_util.setup_coder()

      event_monitor = Monitor('.events', 'event')
      result_monitor = Monitor('.results', 'result')

      if self.streaming:
        if self.pubsub_mode != 'SUBSCRIBE_ONLY':
          self.generate_events()
        if self.pubsub_mode == 'PUBLISH_ONLY':
          return
        events = self.read_from_pubsub()
      else:
        events = self.read_from_file()

      events = events | 'event_monitor' >> beam.ParDo(event_monitor.doFn)
      output = query.load(events, query_args, pipeline_options)
      output | 'result_monitor' >> beam.ParDo(result_monitor.doFn)  # pylint: disable=expression-not-assigned

      result = self.pipeline.run()
      if not self.streaming:
        result.wait_until_finish()
      perf = self.monitor(result, event_monitor, result_monitor)
      self.log_performance(perf)
      if self.export_influxdb:
        self.publish_performance_influxdb(query_num, perf)

    except Exception as exc:
      query_errors.append(str(exc))
      raise

  def monitor(self, job, event_monitor, result_monitor):
    """Monitors the performance and progress of the running job, cancelling
    the job if it gets stuck or appears to have finished running.

    Returns:
      the final performance, if it was measured
    """
    logging.info('starting to monitor the job')
    last_active_ms = -1
    perf = None
    cancel_job = False
    waiting_for_shutdown = False

    while True:
      now = int(time.time() * 1000)  # current time in ms
      logging.debug('now is %d', now)

      curr_perf = NexmarkLauncher.get_performance(
          job, event_monitor, result_monitor)
      if perf is None or curr_perf.has_progress(perf):
        last_active_ms = now

      # Only consider cancelling the job if it is a streaming job that has
      # not already been shut down.
      if self.streaming and not waiting_for_shutdown:
        quiet_duration = (now - last_active_ms) // 1000
        if (curr_perf.event_count >= self.args.num_events and
            curr_perf.result_count >= 0 and quiet_duration > self.DONE_DELAY):
          # We consider the job finished once the expected input count has
          # been seen and no new results have been produced for a while.
          logging.info('streaming query appears to have finished executing')
          waiting_for_shutdown = True
          cancel_job = True
        elif quiet_duration > self.TERMINATE_DELAY:
          logging.error(
              'streaming query has been stuck for %d seconds', quiet_duration)
          logging.error('canceling streaming job')
          waiting_for_shutdown = True
          cancel_job = True
        elif quiet_duration > self.WARNING_DELAY:
          logging.warning(
              'streaming query has been stuck for %d seconds', quiet_duration)

      if cancel_job:
        job.cancel()

      perf = curr_perf

      stopped = PipelineState.is_terminal(job.state)
      if stopped:
        break

      if not waiting_for_shutdown:
        if last_active_ms == now:
          logging.info('activity seen, new performance data extracted')
        else:
          logging.info('no activity seen')
      else:
        logging.info('waiting for shutdown')

      time.sleep(self.PERF_DELAY)

    return perf
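  # Monitoring timeline implied by the class constants: perf is sampled on
  # each PERF_DELAY (20 s) iteration of the loop above; after WARNING_DELAY
  # (10 min) without progress a warning is logged; once all expected events
  # have been seen and DONE_DELAY (5 min) passes quietly, the job is treated
  # as done; after TERMINATE_DELAY (1 hour) without progress it is cancelled
  # as stuck.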
  @staticmethod
  def log_performance(perf):
    # type: (NexmarkPerf) -> None
    logging.info(
        'input event count: %d, output event count: %d',
        perf.event_count,
        perf.result_count)
    logging.info(
        'query run took %.1f seconds and processed %.1f events per second',
        perf.runtime_sec,
        perf.event_per_sec)

  def publish_performance_influxdb(self, query_num, perf):
    processing_mode = 'streaming' if self.streaming else 'batch'
    measurement = '%s_%d_python_%s' % (
        self.influx_base, query_num, processing_mode)

    tags = {'runner': self.pipeline_options.view_as(StandardOptions).runner}

    mt = ','.join([measurement] + [k + '=' + v for k, v in tags.items()])

    fields = {
        'numResults': '%di' % (perf.result_count),
        'runtimeMs': '%di' % (perf.runtime_sec * 1000),
    }

    ts = int(time.time())
    payload = '\n'.join(
        ['%s %s=%s %d' % (mt, k, v, ts) for k, v in fields.items()])

    url = '%s/write' % (self.influx_host)
    query_str = {
        'db': self.influx_database,
        'rp': self.influx_retention,
        'precision': 's',
    }

    user = os.getenv('INFLUXDB_USER')
    password = os.getenv('INFLUXDB_USER_PASSWORD')
    auth = HTTPBasicAuth(user, password)

    try:
      response = requests.post(url, params=query_str, data=payload, auth=auth)
    except requests.exceptions.RequestException as e:
      logging.warning('Failed to publish metrics to InfluxDB: %s', e)
    else:
      if response.status_code != 204:
        content = json.loads(response.content)
        logging.warning(
            'Failed to publish metrics to InfluxDB. Received status code %s '
            'with an error message: %s',
            response.status_code,
            content['error'])
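  # Illustrative payload produced above (hypothetical values): for query 3 in
  # streaming mode on the DirectRunner, with 100 results in 52 s at
  # ts=1700000000, the InfluxDB line protocol POSTed to <influx_host>/write
  # would be:
  #
  #   nexmark_3_python_streaming,runner=DirectRunner numResults=100i 1700000000
  #   nexmark_3_python_streaming,runner=DirectRunner runtimeMs=52000i 1700000000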
  @staticmethod
  def get_performance(result, event_monitor, result_monitor):
    event_count = nexmark_util.get_counter_metric(
        result,
        event_monitor.namespace,
        event_monitor.name_prefix + MonitorSuffix.ELEMENT_COUNTER)
    event_start = nexmark_util.get_start_time_metric(
        result,
        event_monitor.namespace,
        event_monitor.name_prefix + MonitorSuffix.EVENT_TIME)
    event_end = nexmark_util.get_end_time_metric(
        result,
        event_monitor.namespace,
        event_monitor.name_prefix + MonitorSuffix.EVENT_TIME)
    result_count = nexmark_util.get_counter_metric(
        result,
        result_monitor.namespace,
        result_monitor.name_prefix + MonitorSuffix.ELEMENT_COUNTER)
    result_end = nexmark_util.get_end_time_metric(
        result,
        result_monitor.namespace,
        result_monitor.name_prefix + MonitorSuffix.EVENT_TIME)

    perf = NexmarkPerf()
    perf.event_count = event_count
    perf.result_count = result_count
    effective_end = max(event_end, result_end)
    if effective_end >= 0 and event_start >= 0:
      perf.runtime_sec = (effective_end - event_start) / 1000
    if event_count >= 0 and perf.runtime_sec > 0:
      perf.event_per_sec = event_count / perf.runtime_sec

    return perf

  def cleanup(self):
    if self.manage_resources:
      from google.cloud import pubsub
      publish_client = pubsub.Client(project=self.project)
      topic = publish_client.topic(self.topic_name)
      if topic.exists():
        logging.info('deleting topic %s', self.topic_name)
        topic.delete()
      sub = topic.subscription(self.subscription_name)
      if sub.exists():
        logging.info('deleting subscription %s', self.subscription_name)
        sub.delete()

  def run(self):
    queries = {
        0: query0,
        1: query1,
        2: query2,
        3: query3,
        4: query4,
        5: query5,
        6: query6,
        7: query7,
        8: query8,
        9: query9,
        10: query10,
        11: query11,
        12: query12
    }

    # TODO(mariagh): Move to a config file.
    query_args = {
        'auction_skip': 123,
        'window_size_sec': 10,
        'window_period_sec': 5,
        'fanout': 5,
        'num_max_workers': 5,
        'max_log_events': 100000,
        'occasional_delay_sec': 3,
        'max_auction_waiting_time': 600
    }

    query_errors = []
    for i in self.args.query:
      logging.info('Running query %d', i)
      self.run_query(
          i,
          queries[i],
          query_args,
          self.pipeline_options,
          query_errors=query_errors)

    if query_errors:
      logging.error('Query failed with %s', ', '.join(query_errors))
    else:
      logging.info('Queries run: %s', self.args.query)


if __name__ == '__main__':
  launcher = NexmarkLauncher()
  launcher.run()
  launcher.cleanup()