github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/gubernator/github/periodic_sync.py (about)

     1  #!/usr/bin/env python
     2  
     3  # Copyright 2018 The Kubernetes Authors.
     4  #
     5  # Licensed under the Apache License, Version 2.0 (the "License");
     6  # you may not use this file except in compliance with the License.
     7  # You may obtain a copy of the License at
     8  #
     9  #     http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  """Periodically synchronize our Datastore view of PRs with Github.
    18  
    19  Various things can cause the local status of a PR to diverge from upstream:
    20  dropped hooks from bugs in the app, upstream GitHub bugs (webhooks aren't
    21  guaranteed!), or a repo *just* starting sending hooks to Gubernator.
    22  
    23  Divergent PR state makes the PR dashboard less useful, since old PRs accumulate
    24  and crowd out real items, decreasing signal-to-noise ratio and user trust.
    25  
    26  To handle these, on a regular schedule we perform a reconciliation step:
    27  - for each repository that we're tracking:
    28    - A = all open PRs from Datastore
    29    - B = all open PRs from Github
    30    - A-B is the set of improperly open PRs. For each PR, add a synthetic
    31      webhook event to Datastore with state=closed, and reprocess.
    32    - B-A is the set of improperly closed or missing PRs. Again, inject a
    33      synthetic webhook with the details received from GitHub and reprocess.
    34  
    35  This requires a Github token set like other secrets with /config in the root.
    36  Total token usage is low: number of open PRs / 100 PRs per list call.
    37  As of 2018-01-10, 1666 open PRs in the k8s org translates into ~56 list calls.
    38  """
    39  
    40  import json
    41  import logging
    42  import re
    43  
    44  import webapp2
    45  
    46  from google.appengine.api import urlfetch
    47  from google.appengine.ext import deferred
    48  
    49  import handlers
    50  import models
    51  import secrets
    52  
# URL template for listing a repo's open pull requests, 100 per page.
# The single %s is filled with the repository name (see get_prs_from_github).
PULL_API = 'https://api.github.com/repos/%s/pulls?state=open&per_page=100'
    54  
    55  
    56  def get_prs_from_github(token, repo):
    57      headers = {'Authorization': 'token %s' % token}
    58      url = PULL_API % repo
    59      prs = []
    60      while True:
    61          logging.info('fetching %s', url)
    62          response = urlfetch.fetch(url, headers=headers)
    63          if response.status_code == 404:
    64              logging.warning('repo was deleted?')
    65              # Returning no open PRs will make us fake a close event for each of
    66              # them, which is appropriate.
    67              return []
    68          if response.status_code != 200:
    69              raise urlfetch.Error('status code %s' % response.status_code)
    70          prs += json.loads(response.content)
    71          m = re.search(r'<([^>]+)>; rel="next"', response.headers.get('Link', ''))
    72          if m:
    73              url = m.group(1)
    74          else:
    75              break
    76      logging.info('pr count: %d, github tokens left: %s',
    77                   len(prs), response.headers.get('x-ratelimit-remaining'))
    78      return prs
    79  
    80  
    81  def inject_event_and_reclassify(repo, number, action, body):
    82      # this follows similar code as handlers.GithubHandler
    83      parent = models.GithubResource.make_key(repo, number)
    84      hook = models.GithubWebhookRaw(
    85          parent=parent, repo=repo, number=number, event='pull_request',
    86          body=json.dumps({'action': action, 'pull_request': body}, sort_keys=True))
    87      hook.put()
    88      deferred.defer(handlers.update_issue_digest, repo, number)
    89  
    90  
    91  def sync_repo(token, repo, write_html=None):
    92      if write_html is None:
    93          write_html = lambda x: None
    94  
    95      logging.info('syncing repo %s', repo)
    96      write_html('<h1>%s</h1>' % repo)
    97  
    98      # There is a race condition here:
    99      # We can't atomically get a list of PRs from the database and GitHub,
   100      # so a PR might falsely be in stale_open_prs if it is opened after
   101      # we scan GitHub, or falsely be in missing_prs if a PR is made after we
   102      # got the list from GitHub, and before we get the list from the database.
   103      #
   104      # These cases will both be fixed the next time this code runs, so we don't
   105      # try to prevent it here.
   106      prs_gh = get_prs_from_github(token, repo)
   107      prs_gh_by_number = {pr['number']: pr for pr in prs_gh}
   108  
   109      prs_db = list(models.GHIssueDigest.find_open_prs_for_repo(repo))
   110      prs_db_by_number = {pr.number: pr for pr in prs_db}
   111  
   112      numbers_datastore = set(prs_db_by_number)
   113      numbers_github = set(prs_gh_by_number)
   114  
   115      stale_open_prs = sorted(numbers_datastore - numbers_github)
   116      missing_prs = sorted(numbers_github - numbers_datastore)
   117  
   118      if not stale_open_prs and not missing_prs:
   119          write_html('matched, no further work needed')
   120          logging.info('matched, no further work needed')
   121          return
   122  
   123      logging.info('PRs to close: %s', stale_open_prs)
   124      logging.info('PRs to open: %s', missing_prs)
   125  
   126      write_html('<br>')
   127      write_html('PRs that should be closed: %s<br>' % stale_open_prs)
   128  
   129      for number in stale_open_prs:
   130          pr = prs_db_by_number[number]
   131          write_html('<b>%d</b><br>%s<br>' % (number, pr))
   132          inject_event_and_reclassify(repo, number, 'gh-sync-close',
   133              {'state': 'closed',
   134               # These other 3 keys are injected because the classifier expects them.
   135               # This simplifies the testing code, and means we don't have to inject
   136               # fake webhooks.
   137               'user': {'login': pr.payload['author']},
   138               'assignees': [{'login': u} for u in pr.payload['assignees']],
   139               'title': pr.payload['title']})
   140  
   141      write_html('PRs that should be opened: %s<br>' % missing_prs)
   142  
   143      for number in missing_prs:
   144          pr = models.shrink(prs_gh_by_number[number])
   145          write_html('<br>%d</br><pre>%s</pre><br>' %
   146              (number, json.dumps(pr, indent=4, sort_keys=True)))
   147          inject_event_and_reclassify(repo, number, 'gh-sync-open', pr)
   148  
   149  
   150  class PRSync(webapp2.RequestHandler):
   151      def get(self):
   152          # This is called automatically by the periodic cron scheduler.
   153          # For debugging, visit something like /sync?repo=kubernetes/test-infra
   154          token = secrets.get('github_token', per_host=False)
   155          if not token:
   156              logging.warning('no github token, skipping sync')
   157              self.abort(200)
   158  
   159          # first, determine which repositories we need to sync
   160          open_prs = list(
   161              models.GHIssueDigest.find_open_prs().fetch(keys_only=True))
   162          open_repos = sorted({models.GHIssueDigest(key=pr).repo for pr in open_prs})
   163  
   164          self.response.write('open repos:')
   165          self.response.write(', '.join(open_repos))
   166  
   167          repo = self.request.get('repo')
   168          if repo:
   169              # debugging case
   170              sync_repo(token, repo, self.response.write)
   171          else:
   172              for repo in open_repos:
   173                  deferred.defer(sync_repo, token, repo)
   174  
   175  
# WSGI application object; routes /sync to the PRSync handler.
# NOTE(review): debug=True is passed unconditionally -- confirm this is
# intended for production deployments.
app = webapp2.WSGIApplication([
    (r'/sync', PRSync),
], debug=True)