github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/benchmarks/nexmark/queries/query4.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """
    19  Query 4, 'Average Price for a Category'. Select the average of the winning bid
    20  prices for all closed auctions in each category. In CQL syntax::
    21  
    22    SELECT Istream(AVG(Q.final))
    23    FROM Category C, (SELECT Rstream(MAX(B.price) AS final, A.category)
    24      FROM Auction A [ROWS UNBOUNDED], Bid B [ROWS UNBOUNDED]
    25      WHERE A.id=B.auction
    26        AND B.datetime < A.expires AND A.expires < CURRENT_TIME
    27      GROUP BY A.id, A.category) Q
    28    WHERE Q.category = C.id
    29    GROUP BY C.id;
    30  
    31  For extra spiciness our implementation differs slightly from the above:
    32  
    33  * We select both the average winning price and the category.
    34  * We don't bother joining with a static category table, since its
    35    contents are never used.
    36  * We only consider bids which are above the auction's reserve price.
    37  * We accept the highest-price, earliest valid bid as the winner.
    38  * We calculate the averages over a sliding window of size
    39    window_size_sec and period window_period_sec.
    40  """
    41  
    42  import apache_beam as beam
    43  from apache_beam.testing.benchmarks.nexmark.queries import nexmark_query_util
    44  from apache_beam.testing.benchmarks.nexmark.queries import winning_bids
    45  from apache_beam.testing.benchmarks.nexmark.queries.nexmark_query_util import ResultNames
    46  from apache_beam.transforms import window
    47  
    48  
    49  def load(events, metadata=None, pipeline_options=None):
    50    # find winning bids for each closed auction
    51    all_winning_bids = (
    52        events
    53        | beam.Filter(nexmark_query_util.auction_or_bid)
    54        | winning_bids.WinningBids())
    55    return (
    56        all_winning_bids
    57        # key winning bids by auction category
    58        | beam.Map(lambda auc_bid: (auc_bid.auction.category, auc_bid.bid.price))
    59        # re-window for sliding average
    60        | beam.WindowInto(
    61            window.SlidingWindows(
    62                metadata.get('window_size_sec'),
    63                metadata.get('window_period_sec')))
    64        # average for each category
    65        | beam.CombinePerKey(beam.combiners.MeanCombineFn())
    66        # TODO(leiyiz): fanout with sliding window produces duplicated results,
    67        #   uncomment after it is fixed
    68        #   [https://github.com/apache/beam/issues/20528]
    69        # .with_hot_key_fanout(metadata.get('fanout'))
    70        # produce output
    71        | beam.ParDo(ProjectToCategoryPriceFn()))
    72  
    73  
    74  class ProjectToCategoryPriceFn(beam.DoFn):
    75    def process(self, element, pane_info=beam.DoFn.PaneInfoParam):
    76      yield {
    77          ResultNames.CATEGORY: element[0],
    78          ResultNames.PRICE: element[1],
    79          ResultNames.IS_LAST: pane_info.is_last
    80      }