# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import logging
import re

import google.appengine.ext.ndb as ndb

import models


# Captures the '/<path>/<build number>' portion of Gubernator build links.
# The dots in the hostname are escaped so they only match literal dots.
XREF_RE = re.compile(r'k8s-gubernator\.appspot\.com/build(/[^])\s]+/\d+)')
# Captures the raw contents of the approvers array in a META comment.
APPROVERS_RE = re.compile(r'<!-- META={"?approvers"?:\[([^]]*)\]} -->')

# Issues with more webhook events than this are not classified at all.
MAX_EVENTS = 800
# Number of webhook entities fetched from Datastore per get_multi call.
EVENT_BATCH_SIZE = 100


class Deduper(object):
    ''' A memory-saving string deduplicator for Python datastructures.

    This is somewhat like the built-in intern() function, but without pinning memory
    permanently.

    Tries to reduce memory usage by making equivalent strings point at the same object.
    This reduces memory usage for large, repetitive JSON structures by >2x.
    '''

    def __init__(self):
        # Maps each string seen so far to its canonical instance.
        self.strings = {}

    def dedup(self, obj):
        '''
        Return a copy of obj in which equal strings share one object.

        Dicts, tuples and lists are rebuilt recursively (dict keys included);
        strings are replaced by the first equal string seen by this Deduper;
        any other value is returned unchanged.
        '''
        if isinstance(obj, basestring):
            return self.strings.setdefault(obj, obj)
        elif isinstance(obj, dict):
            return {self.dedup(k): self.dedup(v) for k, v in obj.iteritems()}
        elif isinstance(obj, tuple):
            return tuple(self.dedup(x) for x in obj)
        elif isinstance(obj, list):
            return [self.dedup(x) for x in obj]
        return obj


def classify_issue(repo, number):
    '''
    Classify an issue in a repo based on events in Datastore.

    Args:
        repo: string
        number: int
    Returns:
        A sequence of five items (a tuple when the issue is blackholed for
        having more than MAX_EVENTS events, a list otherwise):
        is_pr: bool
        is_open: bool
        involved: list of strings representing usernames involved
        payload: a dict, see full description for classify below.
            When blackholed it is just {'num_events': int}.
        last_event_timestamp: the timestamp of the most recent event.
    '''
    ancestor = models.GithubResource.make_key(repo, number)
    logging.debug('finding webhooks for %s %s', repo, number)
    event_keys = list(models.GithubWebhookRaw.query(ancestor=ancestor).fetch(keys_only=True))

    logging.debug('classifying %s %s (%d events)', repo, number, len(event_keys))
    event_tuples = []
    last_event_timestamp = datetime.datetime(2000, 1, 1)

    if len(event_keys) > MAX_EVENTS:
        logging.warning('too many events. blackholing.')
        return False, False, [], {'num_events': len(event_keys)}, last_event_timestamp

    deduper = Deduper()

    for start in xrange(0, len(event_keys), EVENT_BATCH_SIZE):
        for event in ndb.get_multi(event_keys[start:start + EVENT_BATCH_SIZE]):
            if event is None:
                # get_multi yields None for keys whose entity no longer exists.
                continue
            last_event_timestamp = max(last_event_timestamp, event.timestamp)
            event_tuples.append(deduper.dedup(event.to_tuple()))

    event_tuples.sort(key=lambda x: x[2])  # sort by timestamp

    del deduper  # attempt to save memory

    merged = get_merged(event_tuples)
    statuses = None
    if 'head' in merged:
        # Only pull requests carry a head SHA to look statuses up by.
        statuses = {}
        for status in models.GHStatus.query_for_sha(repo, merged['head']['sha']):
            last_event_timestamp = max(last_event_timestamp, status.updated_at)
            statuses[status.context] = [
                status.state, status.target_url, status.description]

    return list(classify(event_tuples, statuses)) + [last_event_timestamp]


def get_merged(events):
    '''
    Determine the most up-to-date view of the issue given its inclusion
    in a series of events.

    Note that different events have different levels of detail-- comments
    don't include head SHA information, pull request events don't have label
    information, etc.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
    Returns:
        body: a dict representing the issue's latest state.
    '''
    merged = {}
    for _event, body, _timestamp in events:
        # Later events overwrite keys set by earlier ones.
        if 'issue' in body:
            merged.update(body['issue'])
        if 'pull_request' in body:
            merged.update(body['pull_request'])
    return merged


def get_labels(events):
    '''
    Determine the labels applied to an issue.

    The event bodies passed in are not modified.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
    Returns:
        labels: the currently applied labels as {label_name: label_color}
    '''
    labels = []
    for event, body, _timestamp in events:
        if 'issue' in body:
            # issues come with labels, so we can update here.
            # Copy the list so the edits below never touch the event body.
            labels = list(body['issue']['labels'])
        # pull_requests don't include their full labels :(
        action = body.get('action')
        if event == 'pull_request':
            # Pull request label events don't come with a full label set.
            # Track them explicitly here.
            try:
                if action in ('labeled', 'unlabeled') and 'label' not in body:
                    logging.warning('label event with no labels (multiple changes?)')
                elif action == 'labeled':
                    label = body['label']
                    if label not in labels:
                        labels.append(label)
                elif action == 'unlabeled':
                    label = body['label']
                    if label in labels:
                        labels.remove(label)
            except Exception:
                # Record the offending payload, then let the error propagate.
                logging.exception('??? %r', body)
                raise
    return {label['name']: label['color'] for label in labels}


def get_skip_comments(events, skip_users=None):
    '''
    Determine comment ids that should be ignored, either because of
    deletion or because the user should be skipped.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        skip_users: optional collection of logins; comment events sent by
            these users are skipped. Defaults to no users.
    Returns:
        comment_ids: a set of comment ids that were deleted or made by
            users that should be skipped.
    '''
    if skip_users is None:
        skip_users = []

    skip_comments = set()
    for event, body, _timestamp in events:
        action = body.get('action')
        if event in ('issue_comment', 'pull_request_review_comment'):
            comment_id = body['comment']['id']
            if action == 'deleted' or body['sender']['login'] in skip_users:
                skip_comments.add(comment_id)
    return skip_comments


def classify(events, statuses=None):
    '''
    Given an event-stream for an issue and status-getter, process
    the events and determine what action should be taken, if any.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        statuses: optional {context: [state, target_url, description]}.
    Returns:
        is_pr: bool
        is_open: bool
        involved: list of strings representing usernames involved
        payload: a dictionary of additional information, including:
            {
                'author': str author_name,
                'assignees': sorted list of assignee and reviewer names,
                'title': str issue title,
                'labels': {label_name: label_color},
                'xrefs': list of builds referenced (by GCS path),
                'attn': {user_name: reason},
                # pull requests only:
                'needs_rebase': bool (open pull requests only),
                'additions': int,
                'deletions': int,
                'head': str head SHA (when known),
                # only when non-empty:
                'status': the statuses argument,
                'approvers': list of approver names,
            }
    Raises:
        KeyError: if the merged issue view lacks 'state', 'user', 'assignees'
            or 'title' (for example when events is empty).
    '''
    merged = get_merged(events)
    labels = get_labels(events)
    comments = get_comments(events)
    xrefs = get_xrefs(comments, merged)
    approvers = get_approvers(comments)
    reviewers = get_reviewers(events)

    is_pr = 'head' in merged or 'pull_request' in merged
    is_open = merged['state'] != 'closed'
    author = merged['user']['login']
    assignees = sorted({assignee['login'] for assignee in merged['assignees']} | reviewers)
    involved = sorted(set([author] + assignees + approvers))

    payload = {
        'author': author,
        'assignees': assignees,
        'title': merged['title'],
        'labels': labels,
        'xrefs': xrefs,
    }

    if is_pr:
        if is_open:
            # NOTE(review): this compares against the string 'false'. If event
            # bodies are parsed JSON, 'mergeable' would be a bool/None and this
            # clause would never match -- confirm against the stored payloads.
            payload['needs_rebase'] = 'needs-rebase' in labels or merged.get('mergeable') == 'false'
        payload['additions'] = merged.get('additions', 0)
        payload['deletions'] = merged.get('deletions', 0)
        if 'head' in merged:
            payload['head'] = merged['head']['sha']

    if statuses:
        payload['status'] = statuses

    if approvers:
        payload['approvers'] = approvers

    # Computed last: calculate_attention reads the keys filled in above.
    payload['attn'] = calculate_attention(distill_events(events), payload)

    return is_pr, is_open, involved, payload


def get_xrefs(comments, merged):
    '''
    Collect the build paths referenced by the issue body and its comments.

    Args:
        comments: a list of comment dicts as returned by get_comments.
        merged: the merged issue state as returned by get_merged.
    Returns:
        xrefs: a sorted list of unique build paths matched by XREF_RE.
    '''
    # The issue body may be missing or None.
    xrefs = set(XREF_RE.findall(merged.get('body') or ''))
    for c in comments:
        xrefs.update(XREF_RE.findall(c['comment']))
    return sorted(xrefs)


def get_comments(events):
    '''
    Pick comments and pull-request review comments out of a list of events.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
    Returns:
        comments: a list of dict(author=..., comment=..., timestamp=...),
            ordered with the earliest comment first.
    '''
    comments = {}  # comment_id : comment
    for event, body, _timestamp in events:
        action = body.get('action')
        if event in ('issue_comment', 'pull_request_review_comment'):
            comment_id = body['comment']['id']
            if action == 'deleted':
                comments.pop(comment_id, None)
            else:
                # Any other action stores the comment, replacing an earlier copy.
                comments[comment_id] = body['comment']
    return [
        {
            'author': c['user']['login'],
            'comment': c['body'],
            'timestamp': c['created_at']
        }
        for c in sorted(comments.values(), key=lambda c: c['created_at'])
    ]


def get_reviewers(events):
    '''
    Return the set of users that have an outstanding code review request.

    A 'review_requested' pull_request event adds its requested reviewer and
    a 'review_request_removed' event removes it. Events of either kind that
    carry no 'requested_reviewer' are logged and ignored.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
    Returns:
        reviewers: a set of login names.
    '''
    reviewers = set()
    for event, body, _timestamp in events:
        if event != 'pull_request':
            continue
        action = body.get('action')
        if action == 'review_requested':
            if 'requested_reviewer' not in body:
                logging.warning('no reviewer present -- self-review?')
                continue
            reviewers.add(body['requested_reviewer']['login'])
        elif action == 'review_request_removed':
            if 'requested_reviewer' not in body:
                logging.warning('review request removed with no reviewer present')
                continue
            reviewers.discard(body['requested_reviewer']['login'])
    return reviewers


def get_approvers(comments):
    '''
    Return approvers requested in comments.

    This MUST be kept in sync with mungegithub's getGubernatorMetadata().

    Only comments authored by k8s-merge-robot are considered, and the last
    one containing an approvers META tag wins. An empty approvers list in
    the tag yields an empty result.

    Args:
        comments: a list of comment dicts as returned by get_comments.
    Returns:
        approvers: a list of approver names.
    '''
    approvers = []
    for comment in comments:
        if comment['author'] == 'k8s-merge-robot':
            m = APPROVERS_RE.search(comment['comment'])
            if m:
                names = m.group(1).replace('"', '').split(',')
                # ''.split(',') is [''], so drop blank entries rather than
                # reporting a nameless approver.
                approvers = [name.strip() for name in names if name.strip()]
    return approvers


def distill_events(events):
    '''
    Given a sequence of events, return a series of user-action tuples
    relevant to determining user state.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
    Returns:
        a list of (action str, user, timestamp) where action is 'comment',
        'push' or 'label <lowercased label name>'.
    '''
    bots = [
        'k8s-bot',
        'k8s-ci-robot',
        'k8s-merge-robot',
        'k8s-oncall',
        'k8s-reviewable',
    ]
    # Comments that were deleted or sent by bots do not count as actions.
    skip_comments = get_skip_comments(events, bots)

    output = []
    for event, body, timestamp in events:
        action = body.get('action')
        user = body.get('sender', {}).get('login')
        if event in ('issue_comment', 'pull_request_review_comment'):
            if body['comment']['id'] in skip_comments:
                continue
            if action == 'created':
                output.append(('comment', user, timestamp))
        if event == 'pull_request_review':
            if action == 'submitted':
                # this is morally equivalent to a comment
                output.append(('comment', user, timestamp))
        if event == 'pull_request':
            if action in ('opened', 'reopened', 'synchronize'):
                output.append(('push', user, timestamp))
            if action == 'labeled' and 'label' in body:
                output.append(('label ' + body['label']['name'].lower(), user, timestamp))
    return output


def evaluate_fsm(events, start, transitions):
    '''
    Given a series of event tuples and a start state, execute the list of transitions
    and return the resulting state, the time it entered that state, and the last time
    the state would be entered (self-transitions are allowed).

    transitions is a list of tuples
    (state_before str, state_after str, condition str or callable)

    A state_before of None matches any current state.

    The transition occurs if condition equals the action (as a str), or if
    condition(action, user) is True.

    For each event only the first matching transition is applied. Both
    returned times are 0 if no transition ever fired.
    '''
    state = start
    state_start = 0  # time that we entered this state
    state_last = 0  # time of last transition into this state
    for action, user, timestamp in events:
        for state_before, state_after, condition in transitions:
            if state_before is None or state_before == state:
                if condition == action or (callable(condition) and condition(action, user)):
                    if state_after != state:
                        state_start = timestamp
                    state = state_after
                    state_last = timestamp
                    break
    return state, state_start, state_last


def get_author_state(author, distilled_events):
    '''
    Determine the state of the author given a series of distilled events.

    Returns:
        (state, state_start, state_last) as produced by evaluate_fsm, where
        state is 'waiting' or 'address comments'.
    '''
    return evaluate_fsm(distilled_events, start='waiting', transitions=[
        # before, after, condition
        (None, 'address comments', lambda a, u: a == 'comment' and u != author),
        ('address comments', 'waiting', 'push'),
        ('address comments', 'waiting', lambda a, u: a == 'comment' and u == author),
    ])


def get_assignee_state(assignee, author, distilled_events):
    '''
    Determine the state of an assignee given a series of distilled events.

    Returns:
        (state, state_start, state_last) as produced by evaluate_fsm, where
        state is 'needs review' or 'waiting'.
    '''
    return evaluate_fsm(distilled_events, start='needs review', transitions=[
        # before, after, condition
        ('needs review', 'waiting', lambda a, u: u == assignee and a in ('comment', 'label lgtm')),
        (None, 'needs review', 'push'),
        (None, 'needs review', lambda a, u: a == 'comment' and u == author),
    ])


def calculate_attention(distilled_events, payload):
    '''
    Given information about an issue, determine who should look at it.

    It can include start and last update time for various states --
    "address comments#123#456" means that something has been in 'address comments' since
    123, and there was some other event that put it in 'address comments' at 456.

    Args:
        distilled_events: a list of (action, user, timestamp) tuples, as
            returned by distill_events.
        payload: the payload dict built by classify; 'author', 'assignees'
            and 'labels' are required, 'status', 'approvers' and
            'needs_rebase' are optional.
    Returns:
        attn: {user_name: reason}. Each user has a single reason; reasons
            assigned later in this function replace earlier ones.
    '''
    author = payload['author']
    assignees = payload['assignees']

    attn = {}
    def notify(to, reason):
        attn[to] = reason

    if any(state == 'failure' for state, _url, _desc
           in payload.get('status', {}).values()):
        notify(author, 'fix tests')

    for approver in payload.get('approvers', []):
        notify(approver, 'needs approval')

    for assignee in assignees:
        assignee_state, first, last = get_assignee_state(assignee, author, distilled_events)
        if assignee_state != 'waiting':
            notify(assignee, '%s#%s#%s' % (assignee_state, first, last))

    author_state, first, last = get_author_state(author, distilled_events)
    if author_state != 'waiting':
        notify(author, '%s#%s#%s' % (author_state, first, last))

    if payload.get('needs_rebase'):
        notify(author, 'needs rebase')
    if 'release-note-label-needed' in payload['labels']:
        notify(author, 'needs release-note label')

    return attn