github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/analyzers/perf_analysis_utils.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from dataclasses import asdict
from dataclasses import dataclass
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import pandas as pd
import yaml
from google.api_core import exceptions

from apache_beam.testing.analyzers import constants
from apache_beam.testing.analyzers import github_issues_utils
from apache_beam.testing.load_tests import load_test_metrics_utils
from apache_beam.testing.load_tests.load_test_metrics_utils import BigQueryMetricsFetcher
from apache_beam.testing.load_tests.load_test_metrics_utils import BigQueryMetricsPublisher
from signal_processing_algorithms.energy_statistics.energy_statistics import e_divisive


@dataclass
class GitHubIssueMetaData:
  """
  This class holds metadata that needs to be published to
  BigQuery when a GitHub issue is created on a performance
  alert.
  """
  issue_timestamp: pd.Timestamp
  change_point_timestamp: pd.Timestamp
  test_name: str
  metric_name: str
  issue_number: int
  issue_url: str
  test_id: str
  change_point: float


def is_change_point_in_valid_window(
    num_runs_in_change_point_window: int,
    latest_change_point_run: int) -> bool:
  # True if the change point occurred within the last
  # num_runs_in_change_point_window runs.
  return num_runs_in_change_point_window > latest_change_point_run


def get_existing_issues_data(
    table_name: str, big_query_metrics_fetcher: BigQueryMetricsFetcher
) -> Optional[pd.DataFrame]:
  """
  Finds the most recent GitHub issues created for the test_name.
  If no table is found with name=table_name, returns None;
  otherwise returns a DataFrame containing up to 10 of the most
  recently created issues, including their issue numbers.
  """
  query = f"""
  SELECT * FROM {constants._BQ_PROJECT_NAME}.{constants._BQ_DATASET}.{table_name}
  ORDER BY {constants._ISSUE_CREATION_TIMESTAMP_LABEL} DESC
  LIMIT 10
  """
  try:
    df = big_query_metrics_fetcher.fetch(query=query)
  except exceptions.NotFound:
    # If no table is found, this is the first performance regression
    # for the current test+metric.
    return None
  return df


def is_perf_alert(
    previous_change_point_timestamps: List[pd.Timestamp],
    change_point_index: int,
    timestamps: List[pd.Timestamp],
    min_runs_between_change_points: int) -> bool:
  """
  Searches the previous_change_point_timestamps using the sibling window
  around the currently observed change point and determines whether the
  change point is a duplicate or not.
  timestamps are expected to be in ascending order.

  Returns False if the currently observed change point is a duplicate of
  an already reported change point, else returns True.
  """
  sibling_change_point_min_timestamp = timestamps[max(
      0, change_point_index - min_runs_between_change_points)]
  sibling_change_point_max_timestamp = timestamps[min(
      change_point_index + min_runs_between_change_points,
      len(timestamps) - 1)]
  # Compare the current change point's sibling window against the list of
  # previously reported change point timestamps, in case the current change
  # point has already been reported in the past.
  for previous_change_point_timestamp in previous_change_point_timestamps:
    if (sibling_change_point_min_timestamp <= previous_change_point_timestamp
        <= sibling_change_point_max_timestamp):
      return False
  return True


def read_test_config(config_file_path: str) -> Dict:
  """
  Reads the config file in which the data required to
  run the change point analysis is specified.
  """
  with open(config_file_path, 'r') as stream:
    config = yaml.safe_load(stream)
  return config


def validate_config(keys):
  # Checks that the config contains all the keys required for a perf test.
  return constants._PERF_TEST_KEYS.issubset(keys)


def fetch_metric_data(
    params: Dict[str, Any], big_query_metrics_fetcher: BigQueryMetricsFetcher
) -> Tuple[List[Union[int, float]], List[pd.Timestamp]]:
  """
  Args:
    params: Dict containing keys required to fetch data from a data source.
    big_query_metrics_fetcher: A BigQuery metrics fetcher used to fetch
      metrics.
  Returns:
    Tuple[List[Union[int, float]], List[pd.Timestamp]]: Tuple containing a
      list of metric_values and a list of timestamps. Both are sorted in
      ascending order with respect to timestamps.
  """
  query = f"""
  SELECT *
  FROM {params['project']}.{params['metrics_dataset']}.{params['metrics_table']}
  WHERE CONTAINS_SUBSTR(({load_test_metrics_utils.METRICS_TYPE_LABEL}), '{params['metric_name']}')
  ORDER BY {load_test_metrics_utils.SUBMIT_TIMESTAMP_LABEL} DESC
  LIMIT {constants._NUM_DATA_POINTS_TO_RUN_CHANGE_POINT_ANALYSIS}
  """
  metric_data: pd.DataFrame = big_query_metrics_fetcher.fetch(query=query)
  metric_data.sort_values(
      by=[load_test_metrics_utils.SUBMIT_TIMESTAMP_LABEL], inplace=True)
  return (
      metric_data[load_test_metrics_utils.VALUE_LABEL].tolist(),
      metric_data[load_test_metrics_utils.SUBMIT_TIMESTAMP_LABEL].tolist())


def find_latest_change_point_index(metric_values: List[Union[float, int]]):
  """
  Args:
    metric_values: Metric values used to run change point analysis.
  Returns:
    int: Rightmost change point index observed on metric_values.
  """
  change_points_idx = e_divisive(metric_values)
  if not change_points_idx:
    return None
  # Consider the latest change point.
  change_points_idx.sort()
  return change_points_idx[-1]


def publish_issue_metadata_to_big_query(issue_metadata, table_name):
  """
  Publishes issue_metadata to BigQuery with table name=table_name.
170 """ 171 bq_metrics_publisher = BigQueryMetricsPublisher( 172 project_name=constants._BQ_PROJECT_NAME, 173 dataset=constants._BQ_DATASET, 174 table=table_name, 175 bq_schema=constants._SCHEMA) 176 bq_metrics_publisher.publish([asdict(issue_metadata)]) 177 logging.info( 178 'GitHub metadata is published to Big Query Dataset %s' 179 ', table %s' % (constants._BQ_DATASET, table_name)) 180 181 182 def create_performance_alert( 183 metric_name: str, 184 test_name: str, 185 timestamps: List[pd.Timestamp], 186 metric_values: List[Union[int, float]], 187 change_point_index: int, 188 labels: List[str], 189 existing_issue_number: Optional[int], 190 test_target: Optional[str] = None) -> Tuple[int, str]: 191 """ 192 Creates performance alert on GitHub issues and returns GitHub issue 193 number and issue URL. 194 """ 195 description = github_issues_utils.get_issue_description( 196 test_name=( 197 test_name if not test_target else test_name + ':' + test_target), 198 metric_name=metric_name, 199 timestamps=timestamps, 200 metric_values=metric_values, 201 change_point_index=change_point_index, 202 max_results_to_display=( 203 constants._NUM_RESULTS_TO_DISPLAY_ON_ISSUE_DESCRIPTION)) 204 205 issue_number, issue_url = github_issues_utils.report_change_point_on_issues( 206 title=github_issues_utils._ISSUE_TITLE_TEMPLATE.format( 207 test_name, metric_name 208 ), 209 description=description, 210 labels=labels, 211 existing_issue_number=existing_issue_number) 212 213 logging.info( 214 'Performance regression/improvement is alerted on issue #%s. Link ' 215 ': %s' % (issue_number, issue_url)) 216 return issue_number, issue_url