# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import logging
import re

import google.appengine.ext.ndb as ndb

import models


XREF_RE = re.compile(r'(?:k8s-gubernator\.appspot\.com|gubernator\.k8s\.io)/build(/[^])\s]+/\d+)')
APPROVERS_RE = re.compile(r'<!-- META={"?approvers"?:\[([^]]*)\]} -->')


def classify_issue(repo, number):
    """
    Classify an issue in a repo based on events in Datastore.

    Args:
        repo: string
        number: int
    Returns:
        A five-element list:
        is_pr: bool
        is_open: bool
        involved: list of strings representing usernames involved
        payload: a dict, see full description for classify below.
        last_event_timestamp: the timestamp of the most recent event.
    """
    ancestor = models.GithubResource.make_key(repo, number)
    logging.info('finding webhooks for %s %s', repo, number)
    event_keys = list(models.GithubWebhookRaw.query(ancestor=ancestor)
                      .order(models.GithubWebhookRaw.timestamp)
                      .fetch(keys_only=True))

    logging.info('classifying %s %s (%d events)', repo, number, len(event_keys))
    # A one-element list so the nested functions below can update the value
    # (this file targets Python 2, which has no `nonlocal`).
    last_event_timestamp = [datetime.datetime(2000, 1, 1)]

    def events_iterator():
        # Load the events in batches of 100 keys, yielding one list of
        # event tuples per batch.
        for x in xrange(0, len(event_keys), 100):
            events = ndb.get_multi(event_keys[x:x+100])
            for event in events:
                last_event_timestamp[0] = max(last_event_timestamp[0], event.timestamp)
            yield [event.to_tuple() for event in events]

    def get_status_for(sha):
        # Map each status context to [state, target_url, description].
        statuses = {}
        for status in models.GHStatus.query_for_sha(repo, sha):
            last_event_timestamp[0] = max(last_event_timestamp[0], status.updated_at)
            statuses[status.context] = [
                status.state, status.target_url, status.description]
        return statuses

    classified = classify_from_iterator(events_iterator(), status_fetcher=get_status_for)
    return list(classified) + last_event_timestamp


def get_merged(events, merged=None):
    """
    Determine the most up-to-date view of the issue given its inclusion
    in a series of events.

    Note that different events have different levels of detail-- comments
    don't include head SHA information, pull request events don't have label
    information, etc.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        merged: the result of a previous invocation. A non-empty dict is
            updated in place.
    Returns:
        body: a dict representing the issue's latest state.
    """
    merged = merged or {}
    for _event, body, _timestamp in events:
        # Later events overwrite the keys they carry; keys they lack survive.
        if 'issue' in body:
            merged.update(body['issue'])
        if 'pull_request' in body:
            merged.update(body['pull_request'])
    return merged


def get_labels(events, labels=None):
    """
    Determine the labels applied to an issue.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        labels: the result of a previous invocation, if any.
    Returns:
        labels: the currently applied labels as {label_name: label_color}
    """
    labels = labels or {}
    for event, body, _timestamp in events:
        if 'issue' in body:
            # issues come with labels, so we can update here
            labels = {l['name']: l['color'] for l in body['issue']['labels']}
        # pull_requests don't include their full labels :(
        action = body.get('action')
        if event == 'pull_request':
            # Pull request label events don't come with a full label set.
            # Track them explicitly here.
            try:
                if action in ('labeled', 'unlabeled') and 'label' not in body:
                    logging.warning('label event with no labels (multiple changes?)')
                elif action == 'labeled':
                    label = body['label']
                    if label['name'] not in labels:
                        labels[label['name']] = label['color']
                elif action == 'unlabeled':
                    labels.pop(body['label']['name'], None)
            except Exception:
                # Record the offending payload, then let the error propagate.
                logging.exception('??? %r', body)
                raise
    return labels


def get_skip_comments(events, skip_users=None):
    """
    Determine comment ids that should be ignored, either because of
    deletion or because the user should be skipped.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        skip_users: usernames whose comment events should be skipped.
    Returns:
        comment_ids: a set of comment ids that were deleted or made by
            users that should be skipped.
    """
    skip_users = skip_users or []
    skip_comments = set()
    for event, body, _timestamp in events:
        action = body.get('action')
        if event in ('issue_comment', 'pull_request_review_comment'):
            comment_id = body['comment']['id']
            if action == 'deleted' or body['sender']['login'] in skip_users:
                skip_comments.add(comment_id)
    return skip_comments


def classify(events, status_fetcher=None):
    """
    Given an event-stream for an issue and status-getter, process
    the events and determine what action should be taken, if any.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        status_fetcher: a function that returns statuses for the given SHA.
    Returns:
        is_pr: bool
        is_open: bool
        involved: list of strings representing usernames involved
        payload: a dictionary of additional information, including:
            {
                'author': str author_name,
                'assignees': sorted list of assignee and reviewer names,
                'title': str issue title,
                'labels': {label_name: label_color},
                'xrefs': list of builds referenced (by GCS path),
                'attn': {user_name: reason},
            }
            and, when applicable, 'needs_rebase', 'additions', 'deletions',
            'head', 'approvers', 'status' and 'milestone'.
    """
    merged = get_merged(events)
    labels = get_labels(events)
    comments = get_comments(events)
    reviewers = get_reviewers(events)
    distilled_events = distill_events(events)

    return _classify_internal(
        merged, labels, comments, reviewers, distilled_events, status_fetcher)


def classify_from_iterator(events_iterator, status_fetcher=None):
    """
    Like classify(), but process batches of events from an iterator.

    Args:
        events_iterator: an iterable yielding successive events lists.
        status_fetcher: a function that returns statuses for the given SHA.
    Returns:
        The same (is_pr, is_open, involved, payload) tuple as classify().
    """
    merged = None
    labels = None
    comments = None
    reviewers = None
    distilled_events = None

    # Each helper takes its own previous output, so state carries across
    # batches.
    for events in events_iterator:
        merged = get_merged(events, merged)
        labels = get_labels(events, labels)
        comments = get_comments(events, comments)
        reviewers = get_reviewers(events, reviewers)
        distilled_events = distill_events(events, distilled_events)

    return _classify_internal(
        merged, labels, comments, reviewers, distilled_events, status_fetcher)


def _classify_internal(merged, labels, comments, reviewers, distilled_events, status_fetcher):
    """
    Build the classification result from the already-aggregated event state.

    Args:
        merged: the issue's latest state, as returned by get_merged.
        labels: {label_name: label_color}, as returned by get_labels.
        comments: list of comment dicts, as returned by get_comments.
        reviewers: set of usernames, as returned by get_reviewers.
        distilled_events: list of tuples, as returned by distill_events.
        status_fetcher: optional function returning statuses for a SHA.
    Returns:
        (is_pr, is_open, involved, payload); see classify for details.
    """
    approvers = get_approvers(comments)

    is_pr = 'head' in merged or 'pull_request' in merged
    is_open = merged['state'] != 'closed'
    author = merged['user']['login']
    assignees = sorted({assignee['login'] for assignee in merged['assignees']} | reviewers)
    involved = sorted(u.lower() for u in set([author] + assignees + approvers))

    payload = {
        'author': author,
        'assignees': assignees,
        'title': merged['title'],
        'labels': labels,
        'xrefs': get_xrefs(comments, merged),
    }

    if is_pr:
        if is_open:
            # NOTE(review): this compares against the string 'false'; confirm
            # stored payloads hold a string here rather than a JSON boolean.
            payload['needs_rebase'] = 'needs-rebase' in labels or merged.get('mergeable') == 'false'
        payload['additions'] = merged.get('additions', 0)
        payload['deletions'] = merged.get('deletions', 0)
        if 'head' in merged:
            payload['head'] = merged['head']['sha']

    if approvers:
        payload['approvers'] = approvers

    if status_fetcher and 'head' in payload:
        payload['status'] = status_fetcher(payload['head'])

    if merged.get('milestone'):
        payload['milestone'] = merged['milestone']['title']

    # Computed last: calculate_attention reads the payload fields set above.
    payload['attn'] = calculate_attention(distilled_events, payload)

    return is_pr, is_open, involved, payload


def get_xrefs(comments, merged):
    """
    Collect the build paths referenced in the issue body and its comments.

    Args:
        comments: list of comment dicts with a 'comment' text field.
        merged: the issue's latest state; its 'body' may be missing or empty.
    Returns:
        A sorted list of the unique paths matched by XREF_RE.
    """
    xrefs = set(XREF_RE.findall(merged.get('body') or ''))
    for c in comments:
        xrefs.update(XREF_RE.findall(c['comment']))
    return sorted(xrefs)


def get_comments(events, comments=None):
    """
    Pick comments and pull-request review comments out of a list of events.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        comments: the previous output of this function, if any.
    Returns:
        comments: a list of dict(author=..., comment=..., timestamp=...,
            id=...), ordered with the earliest comment first.
    """
    # Index the previous result by comment id so that later events can
    # replace or delete comments seen in an earlier batch.
    if not comments:
        comments = {}
    else:
        comments = {c['id']: c for c in comments}
    for event, body, _timestamp in events:
        action = body.get('action')
        if event in ('issue_comment', 'pull_request_review_comment'):
            comment_id = body['comment']['id']
            if action == 'deleted':
                comments.pop(comment_id, None)
            else:
                c = body['comment']
                comments[comment_id] = {
                    'author': c['user']['login'],
                    'comment': c['body'],
                    'timestamp': c['created_at'],
                    'id': c['id'],
                }
    return sorted(comments.values(), key=lambda c: c['timestamp'])


def get_reviewers(events, reviewers=None):
    """
    Return the set of users that have a code review requested or completed.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        reviewers: the result of a previous invocation. A non-empty set is
            updated in place.
    Returns:
        reviewers: a set of usernames.
    """
    reviewers = reviewers or set()
    for event, body, _timestamp in events:
        action = body.get('action')
        if event == 'pull_request':
            if action == 'review_requested':
                if 'requested_reviewer' not in body:
                    logging.warning('no reviewer present -- self-review?')
                    continue
                reviewers.add(body['requested_reviewer']['login'])
            elif action == 'review_request_removed':
                # Guard the same way as review_requested above: without a
                # reviewer in the payload there is nobody to remove.
                if 'requested_reviewer' not in body:
                    continue
                reviewers.discard(body['requested_reviewer']['login'])
        elif event == 'pull_request_review':
            if action == 'submitted':
                reviewers.add(body['sender']['login'])

    return reviewers


def get_approvers(comments):
    """
    Return approvers requested in comments.

    Only comments authored by k8s-merge-robot are considered, and the last
    one that matches APPROVERS_RE wins.

    This MUST be kept in sync with mungegithub's getGubernatorMetadata().

    Args:
        comments: list of comment dicts with 'author' and 'comment' fields.
    Returns:
        approvers: a list of usernames, empty if none were requested.
    """
    approvers = []
    for comment in comments:
        if comment['author'] == 'k8s-merge-robot':
            m = APPROVERS_RE.search(comment['comment'])
            if m:
                names = m.group(1).replace('"', '').split(',')
                # ''.split(',') is [''], so an empty approver list would
                # otherwise yield a single bogus empty username.
                approvers = [name for name in names if name]
    return approvers


def distill_events(events, distilled_events=None):
    """
    Given a sequence of events, return a series of user-action tuples
    relevant to determining user state.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        distilled_events: the result of a previous invocation. A non-empty
            list is appended to in place.
    Returns:
        A list of (action str, user, timestamp) tuples, where action is
        'comment', 'push', or 'label <lowercased label name>'.
    """
    bots = [
        'k8s-bot',
        'k8s-ci-robot',
        'k8s-merge-robot',
        'k8s-oncall',
        'k8s-reviewable',
    ]
    skip_comments = get_skip_comments(events, bots)

    output = distilled_events or []
    for event, body, timestamp in events:
        action = body.get('action')
        user = body.get('sender', {}).get('login')
        if event in ('issue_comment', 'pull_request_review_comment'):
            if body['comment']['id'] in skip_comments:
                continue
            if action == 'created':
                output.append(('comment', user, timestamp))
        if event == 'pull_request_review':
            if action == 'submitted':
                # this is morally equivalent to a comment
                output.append(('comment', user, timestamp))
        if event == 'pull_request':
            if action in ('opened', 'reopened', 'synchronize'):
                output.append(('push', user, timestamp))
            if action == 'labeled' and 'label' in body:
                output.append(('label ' + body['label']['name'].lower(), user, timestamp))
    return output


def evaluate_fsm(events, start, transitions):
    """
    Given a series of event tuples and a start state, execute the list of transitions
    and return the resulting state, the time it entered that state, and the last time
    the state would be entered (self-transitions are allowed).

    transitions is a list of tuples
    (state_before str, state_after str, condition str or callable)

    A state_before of None matches any current state.

    The transition occurs if condition equals the action (as a str), or if
    condition(action, user) is True. For each event, only the first matching
    transition is applied.

    Both returned times are 0 if no transition ever fired.
    """
    state = start
    state_start = 0  # time that we entered this state
    state_last = 0  # time of last transition into this state
    for action, user, timestamp in events:
        for state_before, state_after, condition in transitions:
            if state_before is None or state_before == state:
                if condition == action or (callable(condition) and condition(action, user)):
                    # A self-transition refreshes state_last but keeps
                    # state_start.
                    if state_after != state:
                        state_start = timestamp
                    state = state_after
                    state_last = timestamp
                    break
    return state, state_start, state_last


def get_author_state(author, distilled_events):
    """
    Determine the state of the author given a series of distilled events.

    Returns:
        (state, state_start, state_last) as produced by evaluate_fsm, where
        state is 'waiting' or 'address comments'.
    """
    return evaluate_fsm(distilled_events, start='waiting', transitions=[
        # before, after, condition
        (None, 'address comments', lambda a, u: a == 'comment' and u != author),
        ('address comments', 'waiting', 'push'),
        ('address comments', 'waiting', lambda a, u: a == 'comment' and u == author),
    ])


def get_assignee_state(assignee, author, distilled_events):
    """
    Determine the state of an assignee given a series of distilled events.

    Returns:
        (state, state_start, state_last) as produced by evaluate_fsm, where
        state is 'needs review' or 'waiting'.
    """
    return evaluate_fsm(distilled_events, start='needs review', transitions=[
        # before, after, condition
        ('needs review', 'waiting', lambda a, u: u == assignee and a in ('comment', 'label lgtm')),
        (None, 'needs review', 'push'),
        (None, 'needs review', lambda a, u: a == 'comment' and u == author),
    ])


def calculate_attention(distilled_events, payload):
    """
    Given information about an issue, determine who should look at it.

    It can include start and last update time for various states --
    "address comments#123#456" means that something has been in 'address comments' since
    123, and there was some other event that put it in 'address comments' at 456.

    Args:
        distilled_events: list of tuples, as returned by distill_events.
        payload: dict with 'author', 'assignees' and 'labels', and optionally
            'status', 'approvers' and 'needs_rebase'.
    Returns:
        attn: {user_name: reason}. Each user has a single reason; a later
            notification replaces an earlier one for the same user.
    """
    author = payload['author']
    assignees = payload['assignees']

    attn = {}
    def notify(to, reason):
        attn[to] = reason

    if any(state == 'failure' for state, _url, _desc
           in payload.get('status', {}).values()):
        notify(author, 'fix tests')

    for approver in payload.get('approvers', []):
        notify(approver, 'needs approval')

    for assignee in assignees:
        assignee_state, first, last = get_assignee_state(assignee, author, distilled_events)
        if assignee_state != 'waiting':
            notify(assignee, '%s#%s#%s' % (assignee_state, first, last))

    author_state, first, last = get_author_state(author, distilled_events)
    if author_state != 'waiting':
        notify(author, '%s#%s#%s' % (author_state, first, last))

    if payload.get('needs_rebase'):
        notify(author, 'needs rebase')
    if 'do-not-merge/release-note-label-needed' in payload['labels']:
        notify(author, 'needs release-note label')

    return attn