github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/analyzers/perf_analysis_utils.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from dataclasses import asdict
from dataclasses import dataclass
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import pandas as pd
import yaml
from google.api_core import exceptions

from apache_beam.testing.analyzers import constants
from apache_beam.testing.analyzers import github_issues_utils
from apache_beam.testing.load_tests import load_test_metrics_utils
from apache_beam.testing.load_tests.load_test_metrics_utils import BigQueryMetricsFetcher
from apache_beam.testing.load_tests.load_test_metrics_utils import BigQueryMetricsPublisher
from signal_processing_algorithms.energy_statistics.energy_statistics import e_divisive


@dataclass
class GitHubIssueMetaData:
  """
  This class holds the metadata that is published to BigQuery
  when a GitHub issue is created for a performance alert.
  """
  issue_timestamp: pd.Timestamp
  change_point_timestamp: pd.Timestamp
  test_name: str
  metric_name: str
  issue_number: int
  issue_url: str
  test_id: str
  change_point: float

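
# A minimal sketch of how this dataclass is typically filled in before being
# published via publish_issue_metadata_to_big_query (which serializes it with
# asdict). All of the concrete values below are hypothetical placeholders.
def _example_issue_metadata() -> GitHubIssueMetaData:
  return GitHubIssueMetaData(
      issue_timestamp=pd.Timestamp.now(tz='UTC'),
      change_point_timestamp=pd.Timestamp('2023-01-01', tz='UTC'),
      test_name='my_perf_test',
      metric_name='runtime_sec',
      issue_number=123,
      issue_url='https://github.com/apache/beam/issues/123',
      test_id='my_perf_test_runtime_sec',
      change_point=42.0)
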
def is_change_point_in_valid_window(
    num_runs_in_change_point_window: int, latest_change_point_run: int) -> bool:
  """
  Returns True if the latest observed change point falls within the last
  num_runs_in_change_point_window runs, i.e. is recent enough to alert on.
  """
  return num_runs_in_change_point_window > latest_change_point_run

def get_existing_issues_data(
    table_name: str, big_query_metrics_fetcher: BigQueryMetricsFetcher
) -> Optional[pd.DataFrame]:
  """
  Finds the most recent GitHub issues created for the given test.
  Returns None if no table named table_name exists (i.e. no issue has been
  created yet); otherwise returns a DataFrame containing the most recently
  created issue metadata rows.
  """
  query = f"""
  SELECT * FROM {constants._BQ_PROJECT_NAME}.{constants._BQ_DATASET}.{table_name}
  ORDER BY {constants._ISSUE_CREATION_TIMESTAMP_LABEL} DESC
  LIMIT 10
  """
  try:
    df = big_query_metrics_fetcher.fetch(query=query)
  except exceptions.NotFound:
    # If no table is found, this is the first performance regression
    # detected for the current test + metric.
    return None
  return df

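
# Illustrative sketch only: rows in these tables are written as
# asdict(GitHubIssueMetaData) (see publish_issue_metadata_to_big_query below),
# so the returned DataFrame is assumed to expose the dataclass field names as
# columns. The fetcher and the table name used here are hypothetical.
def _example_read_existing_issues(fetcher: BigQueryMetricsFetcher):
  existing = get_existing_issues_data(
      table_name='my_perf_test_runtime_sec', big_query_metrics_fetcher=fetcher)
  if existing is None:
    return None, []
  latest_issue_number = existing['issue_number'].iloc[0]
  previous_change_point_timestamps = existing[
      'change_point_timestamp'].tolist()
  return latest_issue_number, previous_change_point_timestamps
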
def is_perf_alert(
    previous_change_point_timestamps: List[pd.Timestamp],
    change_point_index: int,
    timestamps: List[pd.Timestamp],
    min_runs_between_change_points: int) -> bool:
  """
  Compares the observed change point against previously reported change
  points using a sibling window of min_runs_between_change_points runs on
  either side of the observed change point. timestamps are expected to be
  in ascending order.

  Returns False if the observed change point is a duplicate of an already
  reported change point, True otherwise.
  """
  sibling_change_point_min_timestamp = timestamps[max(
      0, change_point_index - min_runs_between_change_points)]
  sibling_change_point_max_timestamp = timestamps[min(
      change_point_index + min_runs_between_change_points, len(timestamps) - 1)]
  # Compare the current change point timestamp against the previously
  # reported change point timestamps, in case the current change point
  # was already reported in the past.
  for previous_change_point_timestamp in previous_change_point_timestamps:
    if (sibling_change_point_min_timestamp <= previous_change_point_timestamp <=
        sibling_change_point_max_timestamp):
      return False
  return True

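
# A small worked example of the sibling-window check above. With daily runs,
# min_runs_between_change_points=1 and a change point at index 2, the sibling
# window spans the timestamps at indices 1..3 (Jan 2 to Jan 4). The previously
# reported change point on Jan 3 falls inside that window, so the new change
# point is treated as a duplicate and is_perf_alert returns False. All values
# here are hypothetical.
def _example_is_perf_alert() -> bool:
  timestamps = [
      pd.Timestamp('2023-01-01'),
      pd.Timestamp('2023-01-02'),
      pd.Timestamp('2023-01-03'),
      pd.Timestamp('2023-01-04'),
  ]
  return is_perf_alert(
      previous_change_point_timestamps=[pd.Timestamp('2023-01-03')],
      change_point_index=2,
      timestamps=timestamps,
      min_runs_between_change_points=1)  # False: duplicate of the Jan 3 alert.
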
def read_test_config(config_file_path: str) -> Dict:
  """
  Reads the config file in which the data required to
  run the change point analysis is specified.
  """
  with open(config_file_path, 'r') as stream:
    config = yaml.safe_load(stream)
  return config

def validate_config(keys):
  """
  Returns True if the given config keys contain every key required to run
  the change point analysis (constants._PERF_TEST_KEYS).
  """
  return constants._PERF_TEST_KEYS.issubset(keys)

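
# Illustrative sketch of loading and validating a test configuration. The
# required keys are defined by constants._PERF_TEST_KEYS (not shown in this
# file); the keys sketched in the comment below simply mirror what
# fetch_metric_data reads, and the file path and layout are assumptions.
def _example_load_config(config_file_path: str = 'perf_tests.yaml') -> Dict:
  # Assumed layout, one entry per test:
  #   my_perf_test:
  #     project: my-gcp-project
  #     metrics_dataset: beam_metrics
  #     metrics_table: my_perf_test
  #     metric_name: runtime_sec
  config = read_test_config(config_file_path)
  for test_name, params in config.items():
    if not validate_config(params.keys()):
      raise ValueError(
          'Invalid config for %s: missing required keys.' % test_name)
  return config
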
def fetch_metric_data(
    params: Dict[str, Any], big_query_metrics_fetcher: BigQueryMetricsFetcher
) -> Tuple[List[Union[int, float]], List[pd.Timestamp]]:
  """
  Args:
   params: Dict containing the keys required to fetch data from a data source.
   big_query_metrics_fetcher: A BigQuery metrics fetcher used to fetch metrics.
  Returns:
    Tuple[List[Union[int, float]], List[pd.Timestamp]]: Tuple containing the
    list of metric_values and the list of timestamps, both sorted in ascending
    order with respect to the timestamps.
  """
  query = f"""
      SELECT *
      FROM {params['project']}.{params['metrics_dataset']}.{params['metrics_table']}
      WHERE CONTAINS_SUBSTR(({load_test_metrics_utils.METRICS_TYPE_LABEL}), '{params['metric_name']}')
      ORDER BY {load_test_metrics_utils.SUBMIT_TIMESTAMP_LABEL} DESC
      LIMIT {constants._NUM_DATA_POINTS_TO_RUN_CHANGE_POINT_ANALYSIS}
    """
  metric_data: pd.DataFrame = big_query_metrics_fetcher.fetch(query=query)
  metric_data.sort_values(
      by=[load_test_metrics_utils.SUBMIT_TIMESTAMP_LABEL], inplace=True)
  return (
      metric_data[load_test_metrics_utils.VALUE_LABEL].tolist(),
      metric_data[load_test_metrics_utils.SUBMIT_TIMESTAMP_LABEL].tolist())

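
# Hedged usage sketch: params mirrors the keys that fetch_metric_data reads
# from the test configuration; every value below is a hypothetical placeholder
# and the fetcher is expected to be an authenticated BigQueryMetricsFetcher.
def _example_fetch_metric_data(fetcher: BigQueryMetricsFetcher):
  params = {
      'project': 'my-gcp-project',
      'metrics_dataset': 'beam_metrics',
      'metrics_table': 'my_perf_test',
      'metric_name': 'runtime_sec',
  }
  metric_values, timestamps = fetch_metric_data(
      params=params, big_query_metrics_fetcher=fetcher)
  return metric_values, timestamps
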
def find_latest_change_point_index(
    metric_values: List[Union[float, int]]) -> Optional[int]:
  """
  Args:
   metric_values: Metric values used to run the change point analysis.
  Returns:
   Optional[int]: Rightmost change point index observed on metric_values,
   or None if no change point is found.
  """
  change_points_idx = e_divisive(metric_values)
  if not change_points_idx:
    return None
  # Consider the latest change point.
  change_points_idx.sort()
  return change_points_idx[-1]

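
# Quick illustrative check with a synthetic series that steps up halfway
# through. e_divisive is expected to flag a change point near the step and
# this helper returns the right-most one; the exact index can vary with the
# data, so treat this as a sketch rather than a guaranteed output.
def _example_find_change_point() -> Optional[int]:
  metric_values = [10.0] * 10 + [25.0] * 10
  return find_latest_change_point_index(metric_values)  # likely near index 10
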
def publish_issue_metadata_to_big_query(
    issue_metadata: GitHubIssueMetaData, table_name: str):
  """
  Publishes issue_metadata to BigQuery with the given table_name.
  """
  bq_metrics_publisher = BigQueryMetricsPublisher(
      project_name=constants._BQ_PROJECT_NAME,
      dataset=constants._BQ_DATASET,
      table=table_name,
      bq_schema=constants._SCHEMA)
  bq_metrics_publisher.publish([asdict(issue_metadata)])
  logging.info(
      'GitHub issue metadata is published to BigQuery dataset %s, table %s',
      constants._BQ_DATASET,
      table_name)

def create_performance_alert(
    metric_name: str,
    test_name: str,
    timestamps: List[pd.Timestamp],
    metric_values: List[Union[int, float]],
    change_point_index: int,
    labels: List[str],
    existing_issue_number: Optional[int],
    test_target: Optional[str] = None) -> Tuple[int, str]:
  """
  Creates a performance alert as a GitHub issue and returns the GitHub
  issue number and issue URL.
  """
  description = github_issues_utils.get_issue_description(
      test_name=(
          test_name if not test_target else test_name + ':' + test_target),
      metric_name=metric_name,
      timestamps=timestamps,
      metric_values=metric_values,
      change_point_index=change_point_index,
      max_results_to_display=(
          constants._NUM_RESULTS_TO_DISPLAY_ON_ISSUE_DESCRIPTION))

  issue_number, issue_url = github_issues_utils.report_change_point_on_issues(
      title=github_issues_utils._ISSUE_TITLE_TEMPLATE.format(
          test_name, metric_name),
      description=description,
      labels=labels,
      existing_issue_number=existing_issue_number)

  logging.info(
      'Performance regression/improvement alerted on issue #%s. Link: %s',
      issue_number,
      issue_url)
  return issue_number, issue_url
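

# A hedged end-to-end sketch tying the helpers in this module together. The
# window size, label, the use of the metrics table name as the test name, and
# the way latest_change_point_run is derived are all assumptions made for
# illustration; the real orchestration lives in the analysis pipeline that
# imports this module.
def _example_alerting_flow(
    fetcher: BigQueryMetricsFetcher,
    params: Dict[str, Any],
    num_runs_in_change_point_window: int = 7
) -> Optional[GitHubIssueMetaData]:
  metric_values, timestamps = fetch_metric_data(params, fetcher)
  change_point_index = find_latest_change_point_index(metric_values)
  if change_point_index is None:
    return None
  # Assumed convention: number of runs observed after the change point.
  latest_change_point_run = len(timestamps) - 1 - change_point_index
  if not is_change_point_in_valid_window(num_runs_in_change_point_window,
                                         latest_change_point_run):
    return None
  issue_number, issue_url = create_performance_alert(
      metric_name=params['metric_name'],
      test_name=params['metrics_table'],
      timestamps=timestamps,
      metric_values=metric_values,
      change_point_index=change_point_index,
      labels=['perf-alert'],
      existing_issue_number=None)
  metadata = GitHubIssueMetaData(
      issue_timestamp=pd.Timestamp.now(tz='UTC'),
      change_point_timestamp=timestamps[change_point_index],
      test_name=params['metrics_table'],
      metric_name=params['metric_name'],
      issue_number=issue_number,
      issue_url=issue_url,
      test_id=params['metrics_table'] + '_' + params['metric_name'],
      change_point=metric_values[change_point_index])
  publish_issue_metadata_to_big_query(
      metadata, table_name=params['metrics_table'])
  return metadata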