github.com/shashidharatd/test-infra@v0.0.0-20171006011030-71304e1ca560/gubernator/github/classifier.py

# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import logging
import re

import google.appengine.ext.ndb as ndb

import models


XREF_RE = re.compile(r'k8s-gubernator.appspot.com/build(/[^])\s]+/\d+)')
APPROVERS_RE = re.compile(r'<!-- META={"?approvers"?:\[([^]]*)\]} -->')
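
# Illustration (hypothetical values): XREF_RE captures the build path from a
# Gubernator link, e.g. 'k8s-gubernator.appspot.com/build/bucket/logs/some-job/1234'
# yields '/bucket/logs/some-job/1234'. APPROVERS_RE matches the machine-readable
# metadata comment emitted by mungegithub, e.g.
# '<!-- META={"approvers":["alice","bob"]} -->' yields '"alice","bob"' in group 1.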


class Deduper(object):
    ''' A memory-saving string deduplicator for Python data structures.

    This is somewhat like the built-in intern() function, but without pinning memory
    permanently.

    Tries to reduce memory usage by making equivalent strings point at the same object.
    This reduces memory usage for large, repetitive JSON structures by >2x.
    '''

    def __init__(self):
        self.strings = {}

    def dedup(self, obj):
        if isinstance(obj, basestring):
            return self.strings.setdefault(obj, obj)
        elif isinstance(obj, dict):
            return {self.dedup(k): self.dedup(v) for k, v in obj.iteritems()}
        elif isinstance(obj, tuple):
            return tuple(self.dedup(x) for x in obj)
        elif isinstance(obj, list):
            return [self.dedup(x) for x in obj]
        return obj
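
# Illustrative sketch (not part of the original module): equal strings obtained
# through dedup() end up sharing a single object, e.g.
#     deduper = Deduper()
#     a = deduper.dedup({'state': 'open'})
#     b = deduper.dedup({'state': 'open'})
#     # a['state'] is b['state'] -> True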


def classify_issue(repo, number):
    '''
    Classify an issue in a repo based on events in Datastore.

    Args:
        repo: string
        number: int
    Returns:
        is_pr: bool
        is_open: bool
        involved: list of strings representing usernames involved
        payload: a dict, see full description for classify below.
        last_event_timestamp: the timestamp of the most recent event.
    '''
    ancestor = models.GithubResource.make_key(repo, number)
    logging.debug('finding webhooks for %s %s', repo, number)
    event_keys = list(models.GithubWebhookRaw.query(ancestor=ancestor).fetch(keys_only=True))

    logging.debug('classifying %s %s (%d events)', repo, number, len(event_keys))
    event_tuples = []
    last_event_timestamp = datetime.datetime(2000, 1, 1)

    if len(event_keys) > 800:
        logging.warning('too many events. blackholing.')
        return False, False, [], {'num_events': len(event_keys)}, last_event_timestamp

    deduper = Deduper()

    for x in xrange(0, len(event_keys), 100):
        events = ndb.get_multi(event_keys[x:x+100])
        last_event_timestamp = max(last_event_timestamp, max(e.timestamp for e in events))
        event_tuples.extend([deduper.dedup(event.to_tuple()) for event in events])

    event_tuples.sort(key=lambda x: x[2])  # sort by timestamp

    del deduper  # attempt to save memory
    del events

    merged = get_merged(event_tuples)
    statuses = None
    if 'head' in merged:
        statuses = {}
        for status in models.GHStatus.query_for_sha(repo, merged['head']['sha']):
            last_event_timestamp = max(last_event_timestamp, status.updated_at)
            statuses[status.context] = [
                status.state, status.target_url, status.description]

    return list(classify(event_tuples, statuses)) + [last_event_timestamp]


def get_merged(events):
    '''
    Determine the most up-to-date view of the issue given its inclusion
    in a series of events.

    Note that different events carry different levels of detail: comments
    don't include head SHA information, pull request events don't have label
    information, etc.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
    Returns:
        body: a dict representing the issue's latest state.
    '''
    merged = {}
    for _event, body, _timestamp in events:
        if 'issue' in body:
            merged.update(body['issue'])
        if 'pull_request' in body:
            merged.update(body['pull_request'])
    return merged
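
# Illustrative sketch (hypothetical event bodies): later events overwrite earlier
# fields, so the merged view reflects the newest information, e.g.
#     get_merged([
#         ('issue_comment', {'issue': {'state': 'open', 'title': 'Fix flake'}}, 1),
#         ('pull_request', {'pull_request': {'state': 'closed'}}, 2),
#     ])
#     # -> {'state': 'closed', 'title': 'Fix flake'}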


def get_labels(events):
    '''
    Determine the labels applied to an issue.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
    Returns:
        labels: the currently applied labels as {label_name: label_color}
    '''
    labels = []
    for event, body, _timestamp in events:
        if 'issue' in body:
            # issues come with labels, so we can update here
            labels = body['issue']['labels']
        # pull_requests don't include their full labels :(
        action = body.get('action')
        if event == 'pull_request':
            # Pull request label events don't come with a full label set.
            # Track them explicitly here.
            try:
                if action in ('labeled', 'unlabeled') and 'label' not in body:
                    logging.warning('label event with no labels (multiple changes?)')
                elif action == 'labeled':
                    label = body['label']
                    if label not in labels:
                        labels.append(label)
                elif action == 'unlabeled':
                    label = body['label']
                    if label in labels:
                        labels.remove(label)
            except:
                logging.exception('??? %r', body)
                raise
    return {label['name']: label['color'] for label in labels}
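
# Illustrative sketch (hypothetical payloads): label dicts accumulate from issue
# snapshots and pull_request (un)labeled events, then collapse to {name: color}, e.g.
#     get_labels([
#         ('pull_request', {'action': 'labeled',
#                           'label': {'name': 'lgtm', 'color': '15dd18'}}, 1),
#     ])
#     # -> {'lgtm': '15dd18'}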


def get_skip_comments(events, skip_users=None):
    '''
    Determine comment ids that should be ignored, either because of
        deletion or because the user should be skipped.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        skip_users: a list of usernames whose comments should be skipped.
    Returns:
        comment_ids: a set of comment ids that were deleted or made by
            users that should be skipped.
    '''
    if skip_users is None:
        skip_users = []

    skip_comments = set()
    for event, body, _timestamp in events:
        action = body.get('action')
        if event in ('issue_comment', 'pull_request_review_comment'):
            comment_id = body['comment']['id']
            if action == 'deleted' or body['sender']['login'] in skip_users:
                skip_comments.add(comment_id)
    return skip_comments


def classify(events, statuses=None):
    '''
    Given an event stream for an issue and an optional dict of statuses,
    process the events and determine what action should be taken, if any.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        statuses: a dict of {context: [state, target_url, description]},
            as assembled by classify_issue.
    Returns:
        is_pr: bool
        is_open: bool
        involved: list of strings representing usernames involved
        payload: a dictionary of additional information, including:
            {
                'author': str author_name,
                'title': str issue title,
                'labels': {label_name: label_color},
                'attn': {user_name: reason},
                'mergeable': bool,
                'comments': [{'user': str name, 'comment': comment, 'timestamp': str iso8601}],
                'xrefs': list of builds referenced (by GCS path),
            }
    '''
    merged = get_merged(events)
    labels = get_labels(events)
    comments = get_comments(events)
    xrefs = get_xrefs(comments, merged)
    approvers = get_approvers(comments)
    reviewers = get_reviewers(events)

    is_pr = 'head' in merged or 'pull_request' in merged
    is_open = merged['state'] != 'closed'
    author = merged['user']['login']
    assignees = sorted({assignee['login'] for assignee in merged['assignees']} | reviewers)
    involved = sorted(set([author] + assignees + approvers))

    payload = {
        'author': author,
        'assignees': assignees,
        'title': merged['title'],
        'labels': labels,
        'xrefs': xrefs,
    }

    if is_pr:
        if is_open:
            payload['needs_rebase'] = 'needs-rebase' in labels or merged.get('mergeable') == 'false'
        payload['additions'] = merged.get('additions', 0)
        payload['deletions'] = merged.get('deletions', 0)
        if 'head' in merged:
            payload['head'] = merged['head']['sha']

    if statuses:
        payload['status'] = statuses

    if approvers:
        payload['approvers'] = approvers

    payload['attn'] = calculate_attention(distill_events(events), payload)

    return is_pr, is_open, involved, payload


def get_xrefs(comments, merged):
    '''
    Return a sorted list of Gubernator build paths cross-referenced in the
    issue body or its comments.
    '''
    xrefs = set(XREF_RE.findall(merged.get('body') or ''))
    for c in comments:
        xrefs.update(XREF_RE.findall(c['comment']))
    return sorted(xrefs)


def get_comments(events):
    '''
    Pick comments and pull-request review comments out of a list of events.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
    Returns:
        comments: a list of dict(author=..., comment=..., timestamp=...),
                  ordered with the earliest comment first.
    '''
    comments = {}  # comment_id : comment
    for event, body, _timestamp in events:
        action = body.get('action')
        if event in ('issue_comment', 'pull_request_review_comment'):
            comment_id = body['comment']['id']
            if action == 'deleted':
                comments.pop(comment_id, None)
            else:
                comments[comment_id] = body['comment']
    return [
            {
                'author': c['user']['login'],
                'comment': c['body'],
                'timestamp': c['created_at']
            }
            for c in sorted(comments.values(), key=lambda c: c['created_at'])
    ]


def get_reviewers(events):
    '''
    Return the set of users that have a code review requested or completed.
    '''
    reviewers = set()
    for event, body, _timestamp in events:
        action = body.get('action')
        if event == 'pull_request':
            if action == 'review_requested':
                if 'requested_reviewer' not in body:
                    logging.warning('no reviewer present -- self-review?')
                    continue
                reviewers.add(body['requested_reviewer']['login'])
            elif action == 'review_request_removed':
                reviewers -= {body['requested_reviewer']['login']}
    return reviewers


def get_approvers(comments):
    '''
    Return approvers requested in comments.

    This MUST be kept in sync with mungegithub's getGubernatorMetadata().
    '''
    approvers = []
    for comment in comments:
        if comment['author'] == 'k8s-merge-robot':
            m = APPROVERS_RE.search(comment['comment'])
            if m:
                approvers = m.group(1).replace('"', '').split(',')
    return approvers


def distill_events(events):
    '''
    Given a sequence of events, return a series of user-action tuples
    relevant to determining user state.
    '''
    bots = [
        'k8s-bot',
        'k8s-ci-robot',
        'k8s-merge-robot',
        'k8s-oncall',
        'k8s-reviewable',
    ]
    skip_comments = get_skip_comments(events, bots)

    output = []
    for event, body, timestamp in events:
        action = body.get('action')
        user = body.get('sender', {}).get('login')
        if event in ('issue_comment', 'pull_request_review_comment'):
            if body['comment']['id'] in skip_comments:
                continue
            if action == 'created':
                output.append(('comment', user, timestamp))
        if event == 'pull_request_review':
            if action == 'submitted':
                # this is morally equivalent to a comment
                output.append(('comment', user, timestamp))
        if event == 'pull_request':
            if action in ('opened', 'reopened', 'synchronize'):
                output.append(('push', user, timestamp))
            if action == 'labeled' and 'label' in body:
                output.append(('label ' + body['label']['name'].lower(), user, timestamp))
    return output
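
# Illustrative sketch (hypothetical values): the distilled stream is a list of
# (action, user, timestamp) tuples such as
#     [('push', 'alice', 1500000000), ('comment', 'bob', 1500000100),
#      ('label lgtm', 'bob', 1500000200)]
# which is what evaluate_fsm() below consumes.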


def evaluate_fsm(events, start, transitions):
    '''
    Given a series of event tuples and a start state, execute the list of transitions
    and return the resulting state, the time it entered that state, and the last time
    the state would be entered (self-transitions are allowed).

    transitions is a list of tuples
    (state_before str, state_after str, condition str or callable)

    The transition occurs if condition equals the action (as a str), or if
    condition(action, user) is True.
    '''
    state = start
    state_start = 0  # time that we entered this state
    state_last = 0   # time of last transition into this state
    for action, user, timestamp in events:
        for state_before, state_after, condition in transitions:
            if state_before is None or state_before == state:
                if condition == action or (callable(condition) and condition(action, user)):
                    if state_after != state:
                        state_start = timestamp
                    state = state_after
                    state_last = timestamp
                    break
    return state, state_start, state_last
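
# Illustrative sketch (hypothetical events): with transitions like those used by
# get_author_state() below, an author 'alice' who receives a review comment and then
# pushes ends up back in 'waiting':
#     evaluate_fsm([('comment', 'bob', 10), ('push', 'alice', 20)],
#                  start='waiting',
#                  transitions=[
#                      (None, 'address comments', lambda a, u: a == 'comment' and u != 'alice'),
#                      ('address comments', 'waiting', 'push'),
#                  ])
#     # -> ('waiting', 20, 20)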


def get_author_state(author, distilled_events):
    '''
    Determine the state of the author given a series of distilled events.
    '''
    return evaluate_fsm(distilled_events, start='waiting', transitions=[
        # before, after, condition
        (None, 'address comments', lambda a, u: a == 'comment' and u != author),
        ('address comments', 'waiting', 'push'),
        ('address comments', 'waiting', lambda a, u: a == 'comment' and u == author),
    ])


def get_assignee_state(assignee, author, distilled_events):
    '''
    Determine the state of an assignee given a series of distilled events.
    '''
    return evaluate_fsm(distilled_events, start='needs review', transitions=[
        # before, after, condition
        ('needs review', 'waiting', lambda a, u: u == assignee and a in ('comment', 'label lgtm')),
        (None, 'needs review', 'push'),
        (None, 'needs review', lambda a, u: a == 'comment' and u == author),
    ])


def calculate_attention(distilled_events, payload):
    '''
    Given information about an issue, determine who should look at it.

    Returns a dict of {username: reason}. A reason may encode the start and last
    update time for a state -- "address comments#123#456" means the issue has been
    in 'address comments' since timestamp 123, and the most recent event that put
    it (back) into 'address comments' happened at 456.
    '''
    author = payload['author']
    assignees = payload['assignees']

    attn = {}
    def notify(to, reason):
        attn[to] = reason

    if any(state == 'failure' for state, _url, _desc
           in payload.get('status', {}).values()):
        notify(author, 'fix tests')

    for approver in payload.get('approvers', []):
        notify(approver, 'needs approval')

    for assignee in assignees:
        assignee_state, first, last = get_assignee_state(assignee, author, distilled_events)
        if assignee_state != 'waiting':
            notify(assignee, '%s#%s#%s' % (assignee_state, first, last))

    author_state, first, last = get_author_state(author, distilled_events)
    if author_state != 'waiting':
        notify(author, '%s#%s#%s' % (author_state, first, last))

    if payload.get('needs_rebase'):
        notify(author, 'needs rebase')
    if 'release-note-label-needed' in payload['labels']:
        notify(author, 'needs release-note label')

    return attn
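
# Illustrative sketch (hypothetical values): the resulting attention map looks like
#     {'alice': 'needs rebase', 'bob': 'needs review#1500000000#1500000200'}
# i.e. the author must rebase, and assignee 'bob' has been in 'needs review'
# since 1500000000 and last re-entered that state at 1500000200.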