k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/gubernator/github/classifier.py (about)

     1  # Copyright 2016 The Kubernetes Authors.
     2  #
     3  # Licensed under the Apache License, Version 2.0 (the "License");
     4  # you may not use this file except in compliance with the License.
     5  # You may obtain a copy of the License at
     6  #
     7  #     http://www.apache.org/licenses/LICENSE-2.0
     8  #
     9  # Unless required by applicable law or agreed to in writing, software
    10  # distributed under the License is distributed on an "AS IS" BASIS,
    11  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  # See the License for the specific language governing permissions and
    13  # limitations under the License.
    14  
    15  import datetime
    16  import logging
    17  import re
    18  
    19  import google.appengine.ext.ndb as ndb
    20  
    21  import models
    22  
    23  
# Matches Gubernator build-result links in issue/comment text, capturing the
# build path portion (e.g. "/some-bucket/pr-logs/.../1234").
XREF_RE = re.compile(r'(?:k8s-gubernator\.appspot\.com|gubernator\.k8s\.io)/build(/[^])\s]+/\d+)')
# Matches the machine-readable approver metadata embedded in merge-bot
# comments, capturing the bracketed approver list (see get_approvers).
APPROVERS_RE = re.compile(r'<!-- META={"?approvers"?:\[([^]]*)\]} -->')
    26  
    27  
    28  def classify_issue(repo, number):
    29      """
    30      Classify an issue in a repo based on events in Datastore.
    31  
    32      Args:
    33          repo: string
    34          number: int
    35      Returns:
    36          is_pr: bool
    37          is_open: bool
    38          involved: list of strings representing usernames involved
    39          payload: a dict, see full description for classify below.
    40          last_event_timestamp: the timestamp of the most recent event.
    41      """
    42      ancestor = models.GithubResource.make_key(repo, number)
    43      logging.info('finding webhooks for %s %s', repo, number)
    44      event_keys = list(models.GithubWebhookRaw.query(ancestor=ancestor)
    45          .order(models.GithubWebhookRaw.timestamp)
    46          .fetch(keys_only=True))
    47  
    48      logging.info('classifying %s %s (%d events)', repo, number, len(event_keys))
    49      last_event_timestamp = [datetime.datetime(2000, 1, 1)]
    50  
    51      def events_iterator():
    52          for x in xrange(0, len(event_keys), 100):
    53              events = ndb.get_multi(event_keys[x:x+100])
    54              for event in events:
    55                  last_event_timestamp[0] = max(last_event_timestamp[0], event.timestamp)
    56              yield [event.to_tuple() for event in events]
    57  
    58      def get_status_for(sha):
    59          statuses = {}
    60          for status in models.GHStatus.query_for_sha(repo, sha):
    61              last_event_timestamp[0] = max(last_event_timestamp[0], status.updated_at)
    62              statuses[status.context] = [
    63                  status.state, status.target_url, status.description]
    64          return statuses
    65  
    66      classified = classify_from_iterator(events_iterator(), status_fetcher=get_status_for)
    67      return list(classified) + last_event_timestamp
    68  
    69  
    70  def get_merged(events, merged=None):
    71      """
    72      Determine the most up-to-date view of the issue given its inclusion
    73      in a series of events.
    74  
    75      Note that different events have different levels of detail-- comments
    76      don't include head SHA information, pull request events don't have label
    77      information, etc.
    78  
    79      Args:
    80          events: a list of (event_type str, event_body dict, timestamp).
    81          merged: the result of a previous invocation.
    82      Returns:
    83          body: a dict representing the issue's latest state.
    84      """
    85      merged = merged or {}
    86      for _event, body, _timestamp in events:
    87          if 'issue' in body:
    88              merged.update(body['issue'])
    89          if 'pull_request' in body:
    90              merged.update(body['pull_request'])
    91      return merged
    92  
    93  
    94  def get_labels(events, labels=None):
    95      """
    96      Determine the labels applied to an issue.
    97  
    98      Args:
    99          events: a list of (event_type str, event_body dict, timestamp).
   100      Returns:
   101          labels: the currently applied labels as {label_name: label_color}
   102      """
   103      labels = labels or {}
   104      for event, body, _timestamp in events:
   105          if 'issue' in body:
   106              # issues come with labels, so we can update here
   107              labels = {l['name']: l['color'] for l in body['issue']['labels']}
   108          # pull_requests don't include their full labels :(
   109          action = body.get('action')
   110          if event == 'pull_request':
   111              # Pull request label events don't come with a full label set.
   112              # Track them explicitly here.
   113              try:
   114                  if action in ('labeled', 'unlabeled') and 'label' not in body:
   115                      logging.warning('label event with no labels (multiple changes?)')
   116                  elif action == 'labeled':
   117                      label = body['label']
   118                      if label['name'] not in labels:
   119                          labels[label['name']] = label['color']
   120                  elif action == 'unlabeled':
   121                      labels.pop(body['label']['name'], None)
   122              except:
   123                  logging.exception('??? %r', body)
   124                  raise
   125      return labels
   126  
   127  
   128  def get_skip_comments(events, skip_users=None):
   129      """
   130      Determine comment ids that should be ignored, either because of
   131          deletion or because the user should be skipped.
   132  
   133      Args:
   134          events: a list of (event_type str, event_body dict, timestamp).
   135      Returns:
   136          comment_ids: a set of comment ids that were deleted or made by
   137              users that should be skipped.
   138      """
   139      skip_users = skip_users or []
   140      skip_comments = set()
   141      for event, body, _timestamp in events:
   142          action = body.get('action')
   143          if event in ('issue_comment', 'pull_request_review_comment'):
   144              comment_id = body['comment']['id']
   145              if action == 'deleted' or body['sender']['login'] in skip_users:
   146                  skip_comments.add(comment_id)
   147      return skip_comments
   148  
   149  def classify(events, status_fetcher=None):
   150      """
   151      Given an event-stream for an issue and status-getter, process
   152      the events and determine what action should be taken, if any.
   153  
   154      Args: One of:
   155          events: a list of (event_type str, event_body dict, timestamp).
   156          events_iterator: an iterable yielding successive events lists
   157          status_fetcher: a function that returns statuses for the given SHA.
   158      Returns:
   159          is_pr: bool
   160          is_open: bool
   161          involved: list of strings representing usernames involved
   162          payload: a dictionary of additional information, including:
   163              {
   164                  'author': str author_name,
   165                  'title': str issue title,
   166                  'labels': {label_name: label_color},
   167                  'attn': {user_name: reason},
   168                  'mergeable': bool,
   169                  'comments': [{'user': str name, 'comment': comment, 'timestamp': str iso8601}],
   170                  'xrefs': list of builds referenced (by GCS path),
   171              }
   172      """
   173      merged = get_merged(events)
   174      labels = get_labels(events)
   175      comments = get_comments(events)
   176      reviewers = get_reviewers(events)
   177      distilled_events = distill_events(events)
   178  
   179      return _classify_internal(
   180          merged, labels, comments, reviewers, distilled_events, status_fetcher)
   181  
   182  
   183  def classify_from_iterator(events_iterator, status_fetcher=None):
   184      """Like classify(), but process batches of events from an iterator."""
   185      merged = None
   186      labels = None
   187      comments = None
   188      reviewers = None
   189      distilled_events = None
   190  
   191      for events in events_iterator:
   192          merged = get_merged(events, merged)
   193          labels = get_labels(events, labels)
   194          comments = get_comments(events, comments)
   195          reviewers = get_reviewers(events, reviewers)
   196          distilled_events = distill_events(events, distilled_events)
   197  
   198      return _classify_internal(
   199          merged, labels, comments, reviewers, distilled_events, status_fetcher)
   200  
   201  
def _classify_internal(merged, labels, comments, reviewers, distilled_events, status_fetcher):
    """
    Combine per-aspect event summaries into the final classification tuple.

    Args:
        merged: dict of the issue's latest state (from get_merged).
        labels: {label_name: label_color} (from get_labels).
        comments: list of comment dicts (from get_comments).
        reviewers: set of reviewer usernames (from get_reviewers).
        distilled_events: list of (action, user, timestamp) tuples
            (from distill_events).
        status_fetcher: optional callable mapping a head SHA to its statuses.
    Returns:
        (is_pr, is_open, involved, payload) -- see classify() for details.
    """
    approvers = get_approvers(comments)

    # Comment-only webhooks carry a 'pull_request' key instead of 'head'.
    is_pr = 'head' in merged or 'pull_request' in merged
    is_open = merged['state'] != 'closed'
    author = merged['user']['login']
    assignees = sorted({assignee['login'] for assignee in merged['assignees']} | reviewers)
    involved = sorted(u.lower() for u in set([author] + assignees + approvers))

    payload = {
        'author': author,
        'assignees': assignees,
        'title': merged['title'],
        'labels': labels,
        'xrefs': get_xrefs(comments, merged),
    }

    if is_pr:
        if is_open:
            # NOTE(review): 'mergeable' is compared against the string 'false';
            # presumably it is serialized that way upstream -- confirm before
            # changing to a boolean comparison.
            payload['needs_rebase'] = 'needs-rebase' in labels or merged.get('mergeable') == 'false'
        payload['additions'] = merged.get('additions', 0)
        payload['deletions'] = merged.get('deletions', 0)
        if 'head' in merged:
            payload['head'] = merged['head']['sha']

    if approvers:
        payload['approvers'] = approvers

    # Only PRs whose head SHA is known get CI status attached.
    if status_fetcher and 'head' in payload:
        payload['status'] = status_fetcher(payload['head'])

    if merged.get('milestone'):
        payload['milestone'] = merged['milestone']['title']

    # Attention depends on the payload built so far (status, approvers, labels).
    payload['attn'] = calculate_attention(distilled_events, payload)

    return is_pr, is_open, involved, payload
   239  
   240  
   241  def get_xrefs(comments, merged):
   242      xrefs = set(XREF_RE.findall(merged.get('body') or ''))
   243      for c in comments:
   244          xrefs.update(XREF_RE.findall(c['comment']))
   245      return sorted(xrefs)
   246  
   247  
   248  def get_comments(events, comments=None):
   249      """
   250      Pick comments and pull-request review comments out of a list of events.
   251      Args:
   252          events: a list of (event_type str, event_body dict, timestamp).
   253          comments_prev: the previous output of this function.
   254      Returns:
   255          comments: a list of dict(author=..., comment=..., timestamp=...),
   256                    ordered with the earliest comment first.
   257      """
   258      if not comments:
   259          comments = {}
   260      else:
   261          comments = {c['id']: c for c in comments}
   262      comments = {}  # comment_id : comment
   263      for event, body, _timestamp in events:
   264          action = body.get('action')
   265          if event in ('issue_comment', 'pull_request_review_comment'):
   266              comment_id = body['comment']['id']
   267              if action == 'deleted':
   268                  comments.pop(comment_id, None)
   269              else:
   270                  c = body['comment']
   271                  comments[comment_id] = {
   272                      'author': c['user']['login'],
   273                      'comment': c['body'],
   274                      'timestamp': c['created_at'],
   275                      'id': c['id'],
   276                  }
   277      return sorted(comments.values(), key=lambda c: c['timestamp'])
   278  
   279  
   280  def get_reviewers(events, reviewers=None):
   281      """
   282      Return the set of users that have a code review requested or completed.
   283      """
   284      reviewers = reviewers or set()
   285      for event, body, _timestamp in events:
   286          action = body.get('action')
   287          if event == 'pull_request':
   288              if action == 'review_requested':
   289                  if 'requested_reviewer' not in body:
   290                      logging.warning('no reviewer present -- self-review?')
   291                      continue
   292                  reviewers.add(body['requested_reviewer']['login'])
   293              elif action == 'review_request_removed':
   294                  reviewers -= {body['requested_reviewer']['login']}
   295          elif event == 'pull_request_review':
   296              if action == 'submitted':
   297                  reviewers.add(body['sender']['login'])
   298  
   299      return reviewers
   300  
   301  
   302  def get_approvers(comments):
   303      """
   304      Return approvers requested in comments.
   305  
   306      This MUST be kept in sync with mungegithub's getGubernatorMetadata().
   307      """
   308      approvers = []
   309      for comment in comments:
   310          if comment['author'] == 'k8s-merge-robot':
   311              m = APPROVERS_RE.search(comment['comment'])
   312              if m:
   313                  approvers = m.group(1).replace('"', '').split(',')
   314      return approvers
   315  
   316  
   317  def distill_events(events, distilled_events=None):
   318      """
   319      Given a sequence of events, return a series of user-action tuples
   320      relevant to determining user state.
   321      """
   322      bots = [
   323          'google-oss-robot',
   324          'istio-testing',
   325          'k8s-bot',
   326          'k8s-ci-robot',
   327          'k8s-merge-robot',
   328          'k8s-oncall',
   329          'k8s-reviewable',
   330      ]
   331      skip_comments = get_skip_comments(events, bots)
   332  
   333      output = distilled_events or []
   334      for event, body, timestamp in events:
   335          action = body.get('action')
   336          user = body.get('sender', {}).get('login')
   337          if event in ('issue_comment', 'pull_request_review_comment'):
   338              if body['comment']['id'] in skip_comments:
   339                  continue
   340              if action == 'created':
   341                  output.append(('comment', user, timestamp))
   342          if event == 'pull_request_review':
   343              if action == 'submitted':
   344                  # this is morally equivalent to a comment
   345                  output.append(('comment', user, timestamp))
   346          if event == 'pull_request':
   347              if action in ('opened', 'reopened', 'synchronize'):
   348                  output.append(('push', user, timestamp))
   349              if action == 'labeled' and 'label' in body:
   350                  output.append(('label ' + body['label']['name'].lower(), user, timestamp))
   351      return output
   352  
   353  
   354  def evaluate_fsm(events, start, transitions):
   355      """
   356      Given a series of event tuples and a start state, execute the list of transitions
   357      and return the resulting state, the time it entered that state, and the last time
   358      the state would be entered (self-transitions are allowed).
   359  
   360      transitions is a list of tuples
   361      (state_before str, state_after str, condition str or callable)
   362  
   363      The transition occurs if condition equals the action (as a str), or if
   364      condition(action, user) is True.
   365      """
   366      state = start
   367      state_start = 0 # time that we entered this state
   368      state_last = 0  # time of last transition into this state
   369      for action, user, timestamp in events:
   370          for state_before, state_after, condition in transitions:
   371              if state_before is None or state_before == state:
   372                  if condition == action or (callable(condition) and condition(action, user)):
   373                      if state_after != state:
   374                          state_start = timestamp
   375                      state = state_after
   376                      state_last = timestamp
   377                      break
   378      return state, state_start, state_last
   379  
   380  
   381  def get_author_state(author, distilled_events):
   382      """
   383      Determine the state of the author given a series of distilled events.
   384      """
   385      return evaluate_fsm(distilled_events, start='waiting', transitions=[
   386          # before, after, condition
   387          (None, 'address comments', lambda a, u: a == 'comment' and u != author),
   388          ('address comments', 'waiting', 'push'),
   389          ('address comments', 'waiting', lambda a, u: a == 'comment' and u == author),
   390      ])
   391  
   392  
   393  def get_assignee_state(assignee, author, distilled_events):
   394      """
   395      Determine the state of an assignee given a series of distilled events.
   396      """
   397      return evaluate_fsm(distilled_events, start='needs review', transitions=[
   398          # before, after, condition
   399          ('needs review', 'waiting', lambda a, u: u == assignee and a in ('comment', 'label lgtm')),
   400          (None, 'needs review', 'push'),
   401          (None, 'needs review', lambda a, u: a == 'comment' and u == author),
   402      ])
   403  
   404  
   405  def calculate_attention(distilled_events, payload):
   406      """
   407      Given information about an issue, determine who should look at it.
   408  
   409      It can include start and last update time for various states --
   410      "address comments#123#456" means that something has been in 'address comments' since
   411      123, and there was some other event that put it in 'address comments' at 456.
   412      """
   413      author = payload['author']
   414      assignees = payload['assignees']
   415  
   416      attn = {}
   417      def notify(to, reason):
   418          attn[to] = reason
   419  
   420      if any(state == 'failure' for state, _url, _desc
   421             in payload.get('status', {}).values()):
   422          notify(author, 'fix tests')
   423  
   424      for approver in payload.get('approvers', []):
   425          notify(approver, 'needs approval')
   426  
   427      for assignee in assignees:
   428          assignee_state, first, last = get_assignee_state(assignee, author, distilled_events)
   429          if assignee_state != 'waiting':
   430              notify(assignee, '%s#%s#%s' % (assignee_state, first, last))
   431  
   432      author_state, first, last = get_author_state(author, distilled_events)
   433      if author_state != 'waiting':
   434          notify(author, '%s#%s#%s' % (author_state, first, last))
   435  
   436      if payload.get('needs_rebase'):
   437          notify(author, 'needs rebase')
   438      if 'do-not-merge/release-note-label-needed' in payload['labels']:
   439          notify(author, 'needs release-note label')
   440  
   441      return attn