#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Utilities for the Nexmark suite.

The Nexmark suite is a series of queries (streaming pipelines) performed
on a simulation of auction events. This util includes:

  - A Command class used to terminate the streaming jobs
    launched in nexmark_launcher.py by the DirectRunner.
  - A ParseEventFn DoFn to parse events received from PubSub.

Usage:

To run a process for a certain duration, define in the code:
  command = Command(process_to_terminate, args)
  command.run(timeout=duration)

"""

# pytype: skip-file

import json
import logging
import threading

import apache_beam as beam
from apache_beam.metrics import MetricsFilter
from apache_beam.runners.runner import PipelineResult  # pylint: disable=unused-import
from apache_beam.testing.benchmarks.nexmark.models import auction_bid
from apache_beam.testing.benchmarks.nexmark.models import nexmark_model
from apache_beam.testing.benchmarks.nexmark.models.field_name import FieldNames
from apache_beam.transforms import window
from apache_beam.utils.timestamp import Timestamp

_LOGGER = logging.getLogger(__name__)


class Command(object):
  """Runs a callable on a daemon thread and waits a bounded time for it.

  Args:
    cmd: the callable to invoke; its ``__name__`` is used in log messages.
    args: an iterable of positional arguments passed to ``cmd``.
  """
  def __init__(self, cmd, args):
    self.cmd = cmd
    self.args = args

  def run(self, timeout):
    """Starts ``cmd(*args)`` in a daemon thread and joins it with a timeout.

    ``Thread.join(timeout)`` only stops *waiting* once the timeout expires;
    it does not stop the thread. Because the thread is a daemon, it will not
    keep the interpreter alive after the caller exits.

    Args:
      timeout: how long to wait for the thread, in seconds.
    """
    def thread_target():
      _LOGGER.debug(
          'Starting thread for %d seconds: %s', timeout, self.cmd.__name__)

      self.cmd(*self.args)
      # Emitted when cmd returns, whether or not the timeout has passed.
      _LOGGER.info(
          '%d seconds elapsed. Thread (%s) finished.',
          timeout,
          self.cmd.__name__)

    thread = threading.Thread(target=thread_target, name='Thread-timeout')
    thread.daemon = True
    thread.start()
    thread.join(timeout)


def setup_coder():
  """Registers the Nexmark model coders with Beam's coder registry."""
  beam.coders.registry.register_coder(
      nexmark_model.Auction, nexmark_model.AuctionCoder)
  beam.coders.registry.register_coder(
      nexmark_model.Person, nexmark_model.PersonCoder)
  beam.coders.registry.register_coder(nexmark_model.Bid, nexmark_model.BidCoder)
  beam.coders.registry.register_coder(
      auction_bid.AuctionBid, auction_bid.AuctionBidCoder)


class ParseEventFn(beam.DoFn):
  """Original parser for parsing raw event info into Python objects.

  Each event line has the following format:

    person: <id starting with 'p'>,name,email,credit_card,city,
        state,timestamp,extra
    auction: <id starting with 'a'>,item_name, description,initial_bid,
        reserve_price,timestamp,expires,seller,category,extra
    bid: <auction starting with 'b'>,bidder,price,timestamp,extra

  For example:

    'p12345,maria,maria@maria.com,1234-5678-9012-3456,
        sunnyvale,CA,1528098831536'
    'a12345,car67,2012 hyundai elantra,15000,20000,
        1528098831536,20180630,maria,vehicle'
    'b12345,maria,20000,1528098831536'
  """

  # Maps the first character of an event line to the model it describes.
  _MODELS = {
      'p': nexmark_model.Person,
      'a': nexmark_model.Auction,
      'b': nexmark_model.Bid,
  }

  def process(self, elem):
    """Yields the model built from one comma-separated event line.

    Args:
      elem: the raw event line as a string.

    Raises:
      ValueError: if the line is empty or its first character is not one of
        the known event prefixes.
    """
    row = elem.split(',')
    # Slice rather than index so an empty line is reported as an invalid
    # event instead of failing with an IndexError.
    model = self._MODELS.get(elem[:1])
    if model is None:
      raise ValueError('Invalid event: %s.' % row)

    event = model(*row)
    _LOGGER.debug('Parsed event: %s', event)
    yield event


class ParseJsonEventFn(beam.DoFn):
  """Parses the raw event info into Python objects.

  Each event line has the following format:

    person: {id,name,email,credit_card,city,
        state,timestamp,extra}
    auction: {id,item_name, description,initial_bid,
        reserve_price,timestamp,expires,seller,category,extra}
    bid: {auction,bidder,price,timestamp,extra}

  For example:

    {"id":1000,"name":"Peter Jones","emailAddress":"nhd@xcat.com",
     "creditCard":"7241 7320 9143 4888","city":"Portland","state":"WY",
     "dateTime":1528098831026,"extra":"WN_HS_bnpVQ\\[["}

    {"id":1000,"itemName":"wkx mgee","description":"eszpqxtdxrvwmmywkmogoahf",
     "initialBid":28873,"reserve":29448,"dateTime":1528098831036,
     "expires":1528098840451,"seller":1000,"category":13,"extra":"zcuupiz"}

    {"auction":1000,"bidder":1001,"price":32530001,"dateTime":1528098831066,
     "extra":"fdiysaV^]NLVsbolvyqwgticfdrwdyiyofWPYTOuwogvszlxjrcNOORM"}
  """
  def process(self, elem):
    """Yields the Person, Auction or Bid described by one JSON event.

    The event kind is chosen by which field is present, checked in order:
    name (Person), item name (Auction), auction (Bid).

    Args:
      elem: the JSON-encoded event.

    Raises:
      ValueError: if none of the distinguishing fields is present.
    """
    json_dict = json.loads(elem)
    # Time fields may arrive either as plain millis or wrapped as
    # {"millis": <int>}; normalise to the plain form.
    if isinstance(json_dict[FieldNames.DATE_TIME], dict):
      json_dict[FieldNames.DATE_TIME] = json_dict[
          FieldNames.DATE_TIME]['millis']
    if FieldNames.NAME in json_dict:
      yield nexmark_model.Person(
          json_dict[FieldNames.ID],
          json_dict[FieldNames.NAME],
          json_dict[FieldNames.EMAIL_ADDRESS],
          json_dict[FieldNames.CREDIT_CARD],
          json_dict[FieldNames.CITY],
          json_dict[FieldNames.STATE],
          millis_to_timestamp(json_dict[FieldNames.DATE_TIME]),
          json_dict[FieldNames.EXTRA])
    elif FieldNames.ITEM_NAME in json_dict:
      if isinstance(json_dict[FieldNames.EXPIRES], dict):
        json_dict[FieldNames.EXPIRES] = json_dict[FieldNames.EXPIRES]['millis']
      yield nexmark_model.Auction(
          json_dict[FieldNames.ID],
          json_dict[FieldNames.ITEM_NAME],
          json_dict[FieldNames.DESCRIPTION],
          json_dict[FieldNames.INITIAL_BID],
          json_dict[FieldNames.RESERVE],
          millis_to_timestamp(json_dict[FieldNames.DATE_TIME]),
          millis_to_timestamp(json_dict[FieldNames.EXPIRES]),
          json_dict[FieldNames.SELLER],
          json_dict[FieldNames.CATEGORY],
          json_dict[FieldNames.EXTRA])
    elif FieldNames.AUCTION in json_dict:
      yield nexmark_model.Bid(
          json_dict[FieldNames.AUCTION],
          json_dict[FieldNames.BIDDER],
          json_dict[FieldNames.PRICE],
          millis_to_timestamp(json_dict[FieldNames.DATE_TIME]),
          json_dict[FieldNames.EXTRA])
    else:
      raise ValueError('Invalid event: %s.' % str(json_dict))


class CountAndLog(beam.PTransform):
  """Re-windows into the global window, counts the elements and logs it."""
  def expand(self, pcoll):
    return (
        pcoll
        | 'window' >> beam.WindowInto(window.GlobalWindows())
        | "Count" >> beam.combiners.Count.Globally()
        | "Log" >> beam.Map(log_count_info))


def log_count_info(count):
  """Logs the result count at INFO level and returns it unchanged."""
  _LOGGER.info('Query resulted in %d results', count)
  return count


def display(elm):
  """Logs the element at DEBUG level and returns it unchanged."""
  _LOGGER.debug(elm)
  return elm


def model_to_json(model):
  """Serialises a model object to a compact JSON string."""
  return json.dumps(construct_json_dict(model), separators=(',', ':'))


def construct_json_dict(model):
  """Builds a JSON-ready dict from the model's instance attributes."""
  return {k: unnest_to_json(v) for k, v in model.__dict__.items()}


def unnest_to_json(cand):
  """Converts one attribute value into something ``json.dumps`` accepts.

  Timestamps become integer milliseconds, nested Auction/Bid/Person objects
  become dicts, and every other value is returned as is.
  """
  if isinstance(cand, Timestamp):
    return cand.micros // 1000
  elif isinstance(
      cand, (nexmark_model.Auction, nexmark_model.Bid, nexmark_model.Person)):
    return construct_json_dict(cand)
  else:
    return cand


def millis_to_timestamp(millis):
  # type: (int) -> Timestamp

  """Converts a millisecond count into a Beam ``Timestamp``."""
  micro_second = millis * 1000
  return Timestamp(micros=micro_second)


def get_counter_metric(result, namespace, name):
  # type: (PipelineResult, str, str) -> int

  """
  get specific counter metric from pipeline result

  Args:
    result: the PipelineResult which metrics are read from
    namespace: a string representing the namespace of wanted metric
    name: a string representing the name of the wanted metric

  Returns:
    the result of the wanted metric if it exist, else -1

  Raises:
    RuntimeError: if more than one counter matches the namespace and name
  """
  metrics = result.metrics().query(
      MetricsFilter().with_namespace(namespace).with_name(name))
  counters = metrics['counters']
  if len(counters) > 1:
    raise RuntimeError(
        '%d instead of one metric result matches name: %s in namespace %s' %
        (len(counters), name, namespace))
  return counters[0].result if counters else -1


def get_start_time_metric(result, namespace, name):
  # type: (PipelineResult, str, str) -> int

  """
  get the start time out of all times recorded by the specified distribution
  metric

  Args:
    result: the PipelineResult which metrics are read from
    namespace: a string representing the namespace of wanted metric
    name: a string representing the name of the wanted metric

  Returns:
    the smallest time in the metric or -1 if it doesn't exist
  """
  distributions = result.metrics().query(
      MetricsFilter().with_namespace(namespace).with_name(
          name))['distributions']
  return min((m.result.min for m in distributions), default=-1)


def get_end_time_metric(result, namespace, name):
  # type: (PipelineResult, str, str) -> int

  """
  get the end time out of all times recorded by the specified distribution
  metric

  Args:
    result: the PipelineResult which metrics are read from
    namespace: a string representing the namespace of wanted metric
    name: a string representing the name of the wanted metric

  Returns:
    the largest time in the metric or -1 if it doesn't exist
  """
  distributions = result.metrics().query(
      MetricsFilter().with_namespace(namespace).with_name(
          name))['distributions']
  return max((m.result.max for m in distributions), default=-1)