github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/benchmarks/nexmark/nexmark_util.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Utilities for the Nexmark suite.
    19  
    20  The Nexmark suite is a series of queries (streaming pipelines) performed
    21  on a simulation of auction events. This util includes:
    22  
    23    - A Command class used to terminate the streaming jobs
    24      launched in nexmark_launcher.py by the DirectRunner.
    25    - A ParseEventFn DoFn to parse events received from PubSub.
    26  
    27  Usage:
    28  
    29  To run a process for a certain duration, define in the code:
    30    command = Command(process_to_terminate, args)
    31    command.run(timeout=duration)
    32  
    33  """
    34  
    35  # pytype: skip-file
    36  
    37  import json
    38  import logging
    39  import threading
    40  
    41  import apache_beam as beam
    42  from apache_beam.metrics import MetricsFilter
    43  from apache_beam.runners.runner import PipelineResult  # pylint: disable=unused-import
    44  from apache_beam.testing.benchmarks.nexmark.models import auction_bid
    45  from apache_beam.testing.benchmarks.nexmark.models import nexmark_model
    46  from apache_beam.testing.benchmarks.nexmark.models.field_name import FieldNames
    47  from apache_beam.transforms import window
    48  from apache_beam.utils.timestamp import Timestamp
    49  
# Module-level logger named after this module.
_LOGGER = logging.getLogger(__name__)
    51  
    52  
    53  class Command(object):
    54    def __init__(self, cmd, args):
    55      self.cmd = cmd
    56      self.args = args
    57  
    58    def run(self, timeout):
    59      def thread_target():
    60        logging.debug(
    61            'Starting thread for %d seconds: %s', timeout, self.cmd.__name__)
    62  
    63        self.cmd(*self.args)
    64        _LOGGER.info(
    65            '%d seconds elapsed. Thread (%s) finished.',
    66            timeout,
    67            self.cmd.__name__)
    68  
    69      thread = threading.Thread(target=thread_target, name='Thread-timeout')
    70      thread.daemon = True
    71      thread.start()
    72      thread.join(timeout)
    73  
    74  
    75  def setup_coder():
    76    beam.coders.registry.register_coder(
    77        nexmark_model.Auction, nexmark_model.AuctionCoder)
    78    beam.coders.registry.register_coder(
    79        nexmark_model.Person, nexmark_model.PersonCoder)
    80    beam.coders.registry.register_coder(nexmark_model.Bid, nexmark_model.BidCoder)
    81    beam.coders.registry.register_coder(
    82        auction_bid.AuctionBid, auction_bid.AuctionBidCoder)
    83  
    84  
    85  class ParseEventFn(beam.DoFn):
    86    """
    87    Original parser for parsing raw events info into a Python objects.
    88  
    89    Each event line has the following format:
    90  
    91      person: <id starting with 'p'>,name,email,credit_card,city, \
    92              state,timestamp,extra
    93      auction: <id starting with 'a'>,item_name, description,initial_bid, \
    94               reserve_price,timestamp,expires,seller,category,extra
    95      bid: <auction starting with 'b'>,bidder,price,timestamp,extra
    96  
    97    For example:
    98  
    99      'p12345,maria,maria@maria.com,1234-5678-9012-3456, \
   100       sunnyvale,CA,1528098831536'
   101      'a12345,car67,2012 hyundai elantra,15000,20000, \
   102       1528098831536,20180630,maria,vehicle'
   103      'b12345,maria,20000,1528098831536'
   104    """
   105    def process(self, elem):
   106      model_dict = {
   107          'p': nexmark_model.Person,
   108          'a': nexmark_model.Auction,
   109          'b': nexmark_model.Bid,
   110      }
   111      row = elem.split(',')
   112      model = model_dict.get(elem[0])
   113      if not model:
   114        raise ValueError('Invalid event: %s.' % row)
   115  
   116      event = model(*row)
   117      logging.debug('Parsed event: %s', event)
   118      yield event
   119  
   120  
   121  class ParseJsonEventFn(beam.DoFn):
   122    """Parses the raw event info into a Python objects.
   123  
   124    Each event line has the following format:
   125  
   126      person:  {id,name,email,credit_card,city, \
   127                state,timestamp,extra}
   128      auction: {id,item_name, description,initial_bid, \
   129                reserve_price,timestamp,expires,seller,category,extra}
   130      bid:     {auction,bidder,price,timestamp,extra}
   131  
   132    For example:
   133  
   134      {"id":1000,"name":"Peter Jones","emailAddress":"nhd@xcat.com",\
   135       "creditCard":"7241 7320 9143 4888","city":"Portland","state":"WY",\
   136       "dateTime":1528098831026,\"extra":"WN_HS_bnpVQ\\[["}
   137  
   138      {"id":1000,"itemName":"wkx mgee","description":"eszpqxtdxrvwmmywkmogoahf",\
   139       "initialBid":28873,"reserve":29448,"dateTime":1528098831036,\
   140       "expires":1528098840451,"seller":1000,"category":13,"extra":"zcuupiz"}
   141  
   142      {"auction":1000,"bidder":1001,"price":32530001,"dateTime":1528098831066,\
   143       "extra":"fdiysaV^]NLVsbolvyqwgticfdrwdyiyofWPYTOuwogvszlxjrcNOORM"}
   144    """
   145    def process(self, elem):
   146      json_dict = json.loads(elem)
   147      if type(json_dict[FieldNames.DATE_TIME]) is dict:
   148        json_dict[FieldNames.DATE_TIME] = json_dict[
   149            FieldNames.DATE_TIME]['millis']
   150      if FieldNames.NAME in json_dict:
   151        yield nexmark_model.Person(
   152            json_dict[FieldNames.ID],
   153            json_dict[FieldNames.NAME],
   154            json_dict[FieldNames.EMAIL_ADDRESS],
   155            json_dict[FieldNames.CREDIT_CARD],
   156            json_dict[FieldNames.CITY],
   157            json_dict[FieldNames.STATE],
   158            millis_to_timestamp(json_dict[FieldNames.DATE_TIME]),
   159            json_dict[FieldNames.EXTRA])
   160      elif FieldNames.ITEM_NAME in json_dict:
   161        if type(json_dict[FieldNames.EXPIRES]) is dict:
   162          json_dict[FieldNames.EXPIRES] = json_dict[FieldNames.EXPIRES]['millis']
   163        yield nexmark_model.Auction(
   164            json_dict[FieldNames.ID],
   165            json_dict[FieldNames.ITEM_NAME],
   166            json_dict[FieldNames.DESCRIPTION],
   167            json_dict[FieldNames.INITIAL_BID],
   168            json_dict[FieldNames.RESERVE],
   169            millis_to_timestamp(json_dict[FieldNames.DATE_TIME]),
   170            millis_to_timestamp(json_dict[FieldNames.EXPIRES]),
   171            json_dict[FieldNames.SELLER],
   172            json_dict[FieldNames.CATEGORY],
   173            json_dict[FieldNames.EXTRA])
   174      elif FieldNames.AUCTION in json_dict:
   175        yield nexmark_model.Bid(
   176            json_dict[FieldNames.AUCTION],
   177            json_dict[FieldNames.BIDDER],
   178            json_dict[FieldNames.PRICE],
   179            millis_to_timestamp(json_dict[FieldNames.DATE_TIME]),
   180            json_dict[FieldNames.EXTRA])
   181      else:
   182        raise ValueError('Invalid event: %s.' % str(json_dict))
   183  
   184  
   185  class CountAndLog(beam.PTransform):
   186    def expand(self, pcoll):
   187      return (
   188          pcoll
   189          | 'window' >> beam.WindowInto(window.GlobalWindows())
   190          | "Count" >> beam.combiners.Count.Globally()
   191          | "Log" >> beam.Map(log_count_info))
   192  
   193  
   194  def log_count_info(count):
   195    logging.info('Query resulted in %d results', count)
   196    return count
   197  
   198  
   199  def display(elm):
   200    logging.debug(elm)
   201    return elm
   202  
   203  
   204  def model_to_json(model):
   205    return json.dumps(construct_json_dict(model), separators=(',', ':'))
   206  
   207  
   208  def construct_json_dict(model):
   209    return {k: unnest_to_json(v) for k, v in model.__dict__.items()}
   210  
   211  
   212  def unnest_to_json(cand):
   213    if isinstance(cand, Timestamp):
   214      return cand.micros // 1000
   215    elif isinstance(
   216        cand, (nexmark_model.Auction, nexmark_model.Bid, nexmark_model.Person)):
   217      return construct_json_dict(cand)
   218    else:
   219      return cand
   220  
   221  
   222  def millis_to_timestamp(millis):
   223    # type: (int) -> Timestamp
   224    micro_second = millis * 1000
   225    return Timestamp(micros=micro_second)
   226  
   227  
   228  def get_counter_metric(result, namespace, name):
   229    # type: (PipelineResult, str, str) -> int
   230  
   231    """
   232    get specific counter metric from pipeline result
   233  
   234    Args:
   235      result: the PipelineResult which metrics are read from
   236      namespace: a string representing the namespace of wanted metric
   237      name: a string representing the  name of the wanted metric
   238  
   239    Returns:
   240      the result of the wanted metric if it exist, else -1
   241    """
   242    metrics = result.metrics().query(
   243        MetricsFilter().with_namespace(namespace).with_name(name))
   244    counters = metrics['counters']
   245    if len(counters) > 1:
   246      raise RuntimeError(
   247          '%d instead of one metric result matches name: %s in namespace %s' %
   248          (len(counters), name, namespace))
   249    return counters[0].result if len(counters) > 0 else -1
   250  
   251  
   252  def get_start_time_metric(result, namespace, name):
   253    # type: (PipelineResult, str, str) -> int
   254  
   255    """
   256    get the start time out of all times recorded by the specified distribution
   257    metric
   258  
   259    Args:
   260      result: the PipelineResult which metrics are read from
   261      namespace: a string representing the namespace of wanted metric
   262      name: a string representing the  name of the wanted metric
   263  
   264    Returns:
   265      the smallest time in the metric or -1 if it doesn't exist
   266    """
   267    distributions = result.metrics().query(
   268        MetricsFilter().with_namespace(namespace).with_name(
   269            name))['distributions']
   270    min_list = list(map(lambda m: m.result.min, distributions))
   271    return min(min_list) if len(min_list) > 0 else -1
   272  
   273  
   274  def get_end_time_metric(result, namespace, name):
   275    # type: (PipelineResult, str, str) -> int
   276  
   277    """
   278    get the end time out of all times recorded by the specified distribution
   279    metric
   280  
   281    Args:
   282      result: the PipelineResult which metrics are read from
   283      namespace: a string representing the namespace of wanted metric
   284      name: a string representing the  name of the wanted metric
   285  
   286    Returns:
   287      the largest time in the metric or -1 if it doesn't exist
   288    """
   289    distributions = result.metrics().query(
   290        MetricsFilter().with_namespace(namespace).with_name(
   291            name))['distributions']
   292    max_list = list(map(lambda m: m.result.max, distributions))
   293    return max(max_list) if len(max_list) > 0 else -1