github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/gubernator/github/classifier.py (about)

     1  # Copyright 2016 The Kubernetes Authors.
     2  #
     3  # Licensed under the Apache License, Version 2.0 (the "License");
     4  # you may not use this file except in compliance with the License.
     5  # You may obtain a copy of the License at
     6  #
     7  #     http://www.apache.org/licenses/LICENSE-2.0
     8  #
     9  # Unless required by applicable law or agreed to in writing, software
    10  # distributed under the License is distributed on an "AS IS" BASIS,
    11  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  # See the License for the specific language governing permissions and
    13  # limitations under the License.
    14  
    15  import datetime
    16  import logging
    17  import re
    18  
    19  import google.appengine.ext.ndb as ndb
    20  
    21  import models
    22  
    23  
# Matches Gubernator build-result links on either hostname, capturing the
# GCS build path ("/<job path>/<build number>") that follows "/build".
XREF_RE = re.compile(r'(?:k8s-gubernator\.appspot\.com|gubernator\.k8s\.io)/build(/[^])\s]+/\d+)')
# Matches the machine-readable approver metadata HTML comment emitted by
# mungegithub, capturing the bracketed approver list.
APPROVERS_RE = re.compile(r'<!-- META={"?approvers"?:\[([^]]*)\]} -->')
    26  
    27  
def classify_issue(repo, number):
    """
    Classify an issue in a repo based on events in Datastore.

    Args:
        repo: string
        number: int
    Returns:
        is_pr: bool
        is_open: bool
        involved: list of strings representing usernames involved
        payload: a dict, see full description for classify below.
        last_event_timestamp: the timestamp of the most recent event.
    """
    ancestor = models.GithubResource.make_key(repo, number)
    logging.info('finding webhooks for %s %s', repo, number)
    # Fetch keys only (ordered by time) so the event bodies can be loaded
    # lazily in batches below instead of all at once.
    event_keys = list(models.GithubWebhookRaw.query(ancestor=ancestor)
        .order(models.GithubWebhookRaw.timestamp)
        .fetch(keys_only=True))

    logging.info('classifying %s %s (%d events)', repo, number, len(event_keys))
    # Single-element list so the nested closures can mutate it in place
    # (Python 2 has no `nonlocal`).
    last_event_timestamp = [datetime.datetime(2000, 1, 1)]

    def events_iterator():
        # Yield events in batches of 100 to bound the size of each
        # Datastore get_multi call.
        for x in xrange(0, len(event_keys), 100):
            events = ndb.get_multi(event_keys[x:x+100])
            for event in events:
                last_event_timestamp[0] = max(last_event_timestamp[0], event.timestamp)
            yield [event.to_tuple() for event in events]

    def get_status_for(sha):
        # Returns {context: [state, target_url, description]} for the SHA,
        # folding status update times into last_event_timestamp as well.
        statuses = {}
        for status in models.GHStatus.query_for_sha(repo, sha):
            last_event_timestamp[0] = max(last_event_timestamp[0], status.updated_at)
            statuses[status.context] = [
                status.state, status.target_url, status.description]
        return statuses

    classified = classify_from_iterator(events_iterator(), status_fetcher=get_status_for)
    # Append the freshest timestamp observed while consuming events/statuses.
    return list(classified) + last_event_timestamp
    68  
    69  
    70  def get_merged(events, merged=None):
    71      """
    72      Determine the most up-to-date view of the issue given its inclusion
    73      in a series of events.
    74  
    75      Note that different events have different levels of detail-- comments
    76      don't include head SHA information, pull request events don't have label
    77      information, etc.
    78  
    79      Args:
    80          events: a list of (event_type str, event_body dict, timestamp).
    81          merged: the result of a previous invocation.
    82      Returns:
    83          body: a dict representing the issue's latest state.
    84      """
    85      merged = merged or {}
    86      for _event, body, _timestamp in events:
    87          if 'issue' in body:
    88              merged.update(body['issue'])
    89          if 'pull_request' in body:
    90              merged.update(body['pull_request'])
    91      return merged
    92  
    93  
    94  def get_labels(events, labels=None):
    95      """
    96      Determine the labels applied to an issue.
    97  
    98      Args:
    99          events: a list of (event_type str, event_body dict, timestamp).
   100      Returns:
   101          labels: the currently applied labels as {label_name: label_color}
   102      """
   103      labels = labels or {}
   104      for event, body, _timestamp in events:
   105          if 'issue' in body:
   106              # issues come with labels, so we can update here
   107              labels = {l['name']: l['color'] for l in body['issue']['labels']}
   108          # pull_requests don't include their full labels :(
   109          action = body.get('action')
   110          if event == 'pull_request':
   111              # Pull request label events don't come with a full label set.
   112              # Track them explicitly here.
   113              try:
   114                  if action in ('labeled', 'unlabeled') and 'label' not in body:
   115                      logging.warning('label event with no labels (multiple changes?)')
   116                  elif action == 'labeled':
   117                      label = body['label']
   118                      if label['name'] not in labels:
   119                          labels[label['name']] = label['color']
   120                  elif action == 'unlabeled':
   121                      labels.pop(body['label']['name'], None)
   122              except:
   123                  logging.exception('??? %r', body)
   124                  raise
   125      return labels
   126  
   127  
   128  def get_skip_comments(events, skip_users=None):
   129      """
   130      Determine comment ids that should be ignored, either because of
   131          deletion or because the user should be skipped.
   132  
   133      Args:
   134          events: a list of (event_type str, event_body dict, timestamp).
   135      Returns:
   136          comment_ids: a set of comment ids that were deleted or made by
   137              users that should be skipped.
   138      """
   139      skip_users = skip_users or []
   140      skip_comments = set()
   141      for event, body, _timestamp in events:
   142          action = body.get('action')
   143          if event in ('issue_comment', 'pull_request_review_comment'):
   144              comment_id = body['comment']['id']
   145              if action == 'deleted' or body['sender']['login'] in skip_users:
   146                  skip_comments.add(comment_id)
   147      return skip_comments
   148  
   149  def classify(events, status_fetcher=None):
   150      """
   151      Given an event-stream for an issue and status-getter, process
   152      the events and determine what action should be taken, if any.
   153  
   154      Args: One of:
   155          events: a list of (event_type str, event_body dict, timestamp).
   156          events_iterator: an iterable yielding successive events lists
   157          status_fetcher: a function that returns statuses for the given SHA.
   158      Returns:
   159          is_pr: bool
   160          is_open: bool
   161          involved: list of strings representing usernames involved
   162          payload: a dictionary of additional information, including:
   163              {
   164                  'author': str author_name,
   165                  'title': str issue title,
   166                  'labels': {label_name: label_color},
   167                  'attn': {user_name: reason},
   168                  'mergeable': bool,
   169                  'comments': [{'user': str name, 'comment': comment, 'timestamp': str iso8601}],
   170                  'xrefs': list of builds referenced (by GCS path),
   171              }
   172      """
   173      merged = get_merged(events)
   174      labels = get_labels(events)
   175      comments = get_comments(events)
   176      reviewers = get_reviewers(events)
   177      distilled_events = distill_events(events)
   178  
   179      return _classify_internal(
   180          merged, labels, comments, reviewers, distilled_events, status_fetcher)
   181  
   182  
   183  def classify_from_iterator(events_iterator, status_fetcher=None):
   184      """Like classify(), but process batches of events from an iterator."""
   185      merged = None
   186      labels = None
   187      comments = None
   188      reviewers = None
   189      distilled_events = None
   190  
   191      for events in events_iterator:
   192          merged = get_merged(events, merged)
   193          labels = get_labels(events, labels)
   194          comments = get_comments(events, comments)
   195          reviewers = get_reviewers(events, reviewers)
   196          distilled_events = distill_events(events, distilled_events)
   197  
   198      return _classify_internal(
   199          merged, labels, comments, reviewers, distilled_events, status_fetcher)
   200  
   201  
   202  def _classify_internal(merged, labels, comments, reviewers, distilled_events, status_fetcher):
   203      approvers = get_approvers(comments)
   204  
   205      is_pr = 'head' in merged or 'pull_request' in merged
   206      is_open = merged['state'] != 'closed'
   207      author = merged['user']['login']
   208      assignees = sorted({assignee['login'] for assignee in merged['assignees']} | reviewers)
   209      involved = sorted(u.lower() for u in set([author] + assignees + approvers))
   210  
   211      payload = {
   212          'author': author,
   213          'assignees': assignees,
   214          'title': merged['title'],
   215          'labels': labels,
   216          'xrefs': get_xrefs(comments, merged),
   217      }
   218  
   219      if is_pr:
   220          if is_open:
   221              payload['needs_rebase'] = 'needs-rebase' in labels or merged.get('mergeable') == 'false'
   222          payload['additions'] = merged.get('additions', 0)
   223          payload['deletions'] = merged.get('deletions', 0)
   224          if 'head' in merged:
   225              payload['head'] = merged['head']['sha']
   226  
   227      if approvers:
   228          payload['approvers'] = approvers
   229  
   230      if status_fetcher and 'head' in payload:
   231          payload['status'] = status_fetcher(payload['head'])
   232  
   233      if merged.get('milestone'):
   234          payload['milestone'] = merged['milestone']['title']
   235  
   236      payload['attn'] = calculate_attention(distilled_events, payload)
   237  
   238      return is_pr, is_open, involved, payload
   239  
   240  
   241  def get_xrefs(comments, merged):
   242      xrefs = set(XREF_RE.findall(merged.get('body') or ''))
   243      for c in comments:
   244          xrefs.update(XREF_RE.findall(c['comment']))
   245      return sorted(xrefs)
   246  
   247  
   248  def get_comments(events, comments=None):
   249      """
   250      Pick comments and pull-request review comments out of a list of events.
   251      Args:
   252          events: a list of (event_type str, event_body dict, timestamp).
   253          comments_prev: the previous output of this function.
   254      Returns:
   255          comments: a list of dict(author=..., comment=..., timestamp=...),
   256                    ordered with the earliest comment first.
   257      """
   258      if not comments:
   259          comments = {}
   260      else:
   261          comments = {c['id']: c for c in comments}
   262      comments = {}  # comment_id : comment
   263      for event, body, _timestamp in events:
   264          action = body.get('action')
   265          if event in ('issue_comment', 'pull_request_review_comment'):
   266              comment_id = body['comment']['id']
   267              if action == 'deleted':
   268                  comments.pop(comment_id, None)
   269              else:
   270                  c = body['comment']
   271                  comments[comment_id] = {
   272                      'author': c['user']['login'],
   273                      'comment': c['body'],
   274                      'timestamp': c['created_at'],
   275                      'id': c['id'],
   276                  }
   277      return sorted(comments.values(), key=lambda c: c['timestamp'])
   278  
   279  
   280  def get_reviewers(events, reviewers=None):
   281      """
   282      Return the set of users that have a code review requested or completed.
   283      """
   284      reviewers = reviewers or set()
   285      for event, body, _timestamp in events:
   286          action = body.get('action')
   287          if event == 'pull_request':
   288              if action == 'review_requested':
   289                  if 'requested_reviewer' not in body:
   290                      logging.warning('no reviewer present -- self-review?')
   291                      continue
   292                  reviewers.add(body['requested_reviewer']['login'])
   293              elif action == 'review_request_removed':
   294                  reviewers -= {body['requested_reviewer']['login']}
   295          elif event == 'pull_request_review':
   296              if action == 'submitted':
   297                  reviewers.add(body['sender']['login'])
   298  
   299      return reviewers
   300  
   301  
   302  def get_approvers(comments):
   303      """
   304      Return approvers requested in comments.
   305  
   306      This MUST be kept in sync with mungegithub's getGubernatorMetadata().
   307      """
   308      approvers = []
   309      for comment in comments:
   310          if comment['author'] == 'k8s-merge-robot':
   311              m = APPROVERS_RE.search(comment['comment'])
   312              if m:
   313                  approvers = m.group(1).replace('"', '').split(',')
   314      return approvers
   315  
   316  
   317  def distill_events(events, distilled_events=None):
   318      """
   319      Given a sequence of events, return a series of user-action tuples
   320      relevant to determining user state.
   321      """
   322      bots = [
   323          'k8s-bot',
   324          'k8s-ci-robot',
   325          'k8s-merge-robot',
   326          'k8s-oncall',
   327          'k8s-reviewable',
   328      ]
   329      skip_comments = get_skip_comments(events, bots)
   330  
   331      output = distilled_events or []
   332      for event, body, timestamp in events:
   333          action = body.get('action')
   334          user = body.get('sender', {}).get('login')
   335          if event in ('issue_comment', 'pull_request_review_comment'):
   336              if body['comment']['id'] in skip_comments:
   337                  continue
   338              if action == 'created':
   339                  output.append(('comment', user, timestamp))
   340          if event == 'pull_request_review':
   341              if action == 'submitted':
   342                  # this is morally equivalent to a comment
   343                  output.append(('comment', user, timestamp))
   344          if event == 'pull_request':
   345              if action in ('opened', 'reopened', 'synchronize'):
   346                  output.append(('push', user, timestamp))
   347              if action == 'labeled' and 'label' in body:
   348                  output.append(('label ' + body['label']['name'].lower(), user, timestamp))
   349      return output
   350  
   351  
   352  def evaluate_fsm(events, start, transitions):
   353      """
   354      Given a series of event tuples and a start state, execute the list of transitions
   355      and return the resulting state, the time it entered that state, and the last time
   356      the state would be entered (self-transitions are allowed).
   357  
   358      transitions is a list of tuples
   359      (state_before str, state_after str, condition str or callable)
   360  
   361      The transition occurs if condition equals the action (as a str), or if
   362      condition(action, user) is True.
   363      """
   364      state = start
   365      state_start = 0 # time that we entered this state
   366      state_last = 0  # time of last transition into this state
   367      for action, user, timestamp in events:
   368          for state_before, state_after, condition in transitions:
   369              if state_before is None or state_before == state:
   370                  if condition == action or (callable(condition) and condition(action, user)):
   371                      if state_after != state:
   372                          state_start = timestamp
   373                      state = state_after
   374                      state_last = timestamp
   375                      break
   376      return state, state_start, state_last
   377  
   378  
   379  def get_author_state(author, distilled_events):
   380      """
   381      Determine the state of the author given a series of distilled events.
   382      """
   383      return evaluate_fsm(distilled_events, start='waiting', transitions=[
   384          # before, after, condition
   385          (None, 'address comments', lambda a, u: a == 'comment' and u != author),
   386          ('address comments', 'waiting', 'push'),
   387          ('address comments', 'waiting', lambda a, u: a == 'comment' and u == author),
   388      ])
   389  
   390  
   391  def get_assignee_state(assignee, author, distilled_events):
   392      """
   393      Determine the state of an assignee given a series of distilled events.
   394      """
   395      return evaluate_fsm(distilled_events, start='needs review', transitions=[
   396          # before, after, condition
   397          ('needs review', 'waiting', lambda a, u: u == assignee and a in ('comment', 'label lgtm')),
   398          (None, 'needs review', 'push'),
   399          (None, 'needs review', lambda a, u: a == 'comment' and u == author),
   400      ])
   401  
   402  
   403  def calculate_attention(distilled_events, payload):
   404      """
   405      Given information about an issue, determine who should look at it.
   406  
   407      It can include start and last update time for various states --
   408      "address comments#123#456" means that something has been in 'address comments' since
   409      123, and there was some other event that put it in 'address comments' at 456.
   410      """
   411      author = payload['author']
   412      assignees = payload['assignees']
   413  
   414      attn = {}
   415      def notify(to, reason):
   416          attn[to] = reason
   417  
   418      if any(state == 'failure' for state, _url, _desc
   419             in payload.get('status', {}).values()):
   420          notify(author, 'fix tests')
   421  
   422      for approver in payload.get('approvers', []):
   423          notify(approver, 'needs approval')
   424  
   425      for assignee in assignees:
   426          assignee_state, first, last = get_assignee_state(assignee, author, distilled_events)
   427          if assignee_state != 'waiting':
   428              notify(assignee, '%s#%s#%s' % (assignee_state, first, last))
   429  
   430      author_state, first, last = get_author_state(author, distilled_events)
   431      if author_state != 'waiting':
   432          notify(author, '%s#%s#%s' % (author_state, first, last))
   433  
   434      if payload.get('needs_rebase'):
   435          notify(author, 'needs rebase')
   436      if 'do-not-merge/release-note-label-needed' in payload['labels']:
   437          notify(author, 'needs release-note label')
   438  
   439      return attn