github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/gubernator/github/periodic_sync.py (about)

#!/usr/bin/env python

# Copyright 2018 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Periodically synchronize our Datastore view of PRs with GitHub.

Various things can cause the local status of a PR to diverge from upstream:
dropped hooks from bugs in the app, upstream GitHub bugs (webhooks aren't
guaranteed!), or a repo *just* starting to send hooks to Gubernator.

Divergent PR state makes the PR dashboard less useful, since old PRs accumulate
and crowd out real items, decreasing the signal-to-noise ratio and user trust.

To handle these, on a regular schedule we perform a reconciliation step:
- for each repository that we're tracking:
  - A = all open PRs from Datastore
  - B = all open PRs from GitHub
  - A-B is the set of improperly open PRs. For each PR, add a synthetic
    webhook event to Datastore with state=closed, and reprocess.
  - B-A is the set of improperly closed or missing PRs. Again, inject a
    synthetic webhook with the details received from GitHub and reprocess.

This requires a GitHub token, set like the other secrets via /config at the root.
Total token usage is low: number of open PRs / 100 PRs per list call.
As of 2018-01-10, 1666 open PRs in the k8s org translate into ~56 list calls.
"""

import json
import logging
import re

import webapp2

from google.appengine.api import urlfetch
from google.appengine.ext import deferred

import handlers
import models
import secrets

PULL_API = 'https://api.github.com/repos/%s/pulls?state=open&per_page=100'


def get_prs_from_github(token, repo):
    headers = {'Authorization': 'token %s' % token}
    url = PULL_API % repo
    prs = []
    while True:
        logging.info('fetching %s', url)
        response = urlfetch.fetch(url, headers=headers)
        if response.status_code == 404:
            logging.warning('repo was deleted?')
            # Returning no open PRs will make us fake a close event for each of
            # them, which is appropriate.
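            # (The empty list means every PR we still track for this repo lands
            # in stale_open_prs in sync_repo below, so each one gets a synthetic
            # 'gh-sync-close' event injected.)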
            return []
        if response.status_code != 200:
            raise urlfetch.Error('status code %s' % response.status_code)
        prs += json.loads(response.content)
        m = re.search(r'<([^>]+)>; rel="next"', response.headers.get('Link', ''))
        if m:
            url = m.group(1)
        else:
            break
    logging.info('pr count: %d, github tokens left: %s',
                 len(prs), response.headers.get('x-ratelimit-remaining'))
    return prs


def inject_event_and_reclassify(repo, number, action, body):
    # this follows code similar to handlers.GithubHandler
    parent = models.GithubResource.make_key(repo, number)
    hook = models.GithubWebhookRaw(
        parent=parent, repo=repo, number=number, event='pull_request',
        body=json.dumps({'action': action, 'pull_request': body}, sort_keys=True))
    hook.put()
    deferred.defer(handlers.update_issue_digest, repo, number)


def sync_repo(token, repo, write_html=None):
    if write_html is None:
        write_html = lambda x: None

    logging.info('syncing repo %s', repo)
    write_html('<h1>%s</h1>' % repo)

    # There is a race condition here:
    # We can't atomically get a list of PRs from the database and GitHub,
    # so a PR might falsely be in stale_open_prs if it is opened after
    # we scan GitHub, or falsely be in missing_prs if a PR is made after we
    # got the list from GitHub, and before we get the list from the database.
    #
    # These cases will both be fixed the next time this code runs, so we don't
    # try to prevent them here.
    prs_gh = get_prs_from_github(token, repo)
    prs_gh_by_number = {pr['number']: pr for pr in prs_gh}

    prs_db = list(models.GHIssueDigest.find_open_prs_for_repo(repo))
    prs_db_by_number = {pr.number: pr for pr in prs_db}

    numbers_datastore = set(prs_db_by_number)
    numbers_github = set(prs_gh_by_number)

    stale_open_prs = sorted(numbers_datastore - numbers_github)
    missing_prs = sorted(numbers_github - numbers_datastore)

    if not stale_open_prs and not missing_prs:
        write_html('matched, no further work needed')
        logging.info('matched, no further work needed')
        return

    logging.info('PRs to close: %s', stale_open_prs)
    logging.info('PRs to open: %s', missing_prs)

    write_html('<br>')
    write_html('PRs that should be closed: %s<br>' % stale_open_prs)

    for number in stale_open_prs:
        pr = prs_db_by_number[number]
        write_html('<b>%d</b><br>%s<br>' % (number, pr))
        inject_event_and_reclassify(repo, number, 'gh-sync-close',
                                    {'state': 'closed',
                                     # These other 3 keys are injected because the
                                     # classifier expects them. This simplifies the
                                     # testing code, and means we don't have to
                                     # inject fake webhooks.
                                     'user': {'login': pr.payload['author']},
                                     'assignees': [{'login': u} for u in pr.payload['assignees']],
                                     'title': pr.payload['title']})

    write_html('PRs that should be opened: %s<br>' % missing_prs)

    for number in missing_prs:
        pr = models.shrink(prs_gh_by_number[number])
        write_html('<br>%d</br><pre>%s</pre><br>' %
                   (number, json.dumps(pr, indent=4, sort_keys=True)))
        inject_event_and_reclassify(repo, number, 'gh-sync-open', pr)


class PRSync(webapp2.RequestHandler):
    def get(self):
        # This is called automatically by the periodic cron scheduler.
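        # Without a ?repo= parameter this fans out one deferred sync_repo task
        # per repository that currently has open PRs in Datastore.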
        # For debugging, visit something like /sync?repo=kubernetes/test-infra
        token = secrets.get('github_token', per_host=False)
        if not token:
            logging.warning('no github token, skipping sync')
            self.abort(200)

        # first, determine which repositories we need to sync
        open_prs = list(
            models.GHIssueDigest.find_open_prs().fetch(keys_only=True))
        open_repos = sorted({models.GHIssueDigest(key=pr).repo for pr in open_prs})

        self.response.write('open repos:')
        self.response.write(', '.join(open_repos))

        repo = self.request.get('repo')
        if repo:
            # debugging case
            sync_repo(token, repo, self.response.write)
        else:
            for repo in open_repos:
                deferred.defer(sync_repo, token, repo)


app = webapp2.WSGIApplication([
    (r'/sync', PRSync),
], debug=True)
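
The reconciliation above reduces to two set differences over PR numbers plus the Link-header parsing that drives pagination in get_prs_from_github. The snippet below is a minimal standalone sketch of just those two pieces; it runs without App Engine, and the PR numbers and Link header value are made up for illustration.

import re

# Toy inputs: PR numbers we believe are open locally vs. what GitHub reports.
numbers_datastore = {101, 102, 105}
numbers_github = {102, 105, 107}

# The same set arithmetic sync_repo() performs.
stale_open_prs = sorted(numbers_datastore - numbers_github)
missing_prs = sorted(numbers_github - numbers_datastore)
assert stale_open_prs == [101]   # would get a synthetic 'gh-sync-close' event
assert missing_prs == [107]      # would get a synthetic 'gh-sync-open' event

# The same regex get_prs_from_github() uses, applied to a made-up Link header.
link = ('<https://api.github.com/repositories/1/pulls?page=2>; rel="next", '
        '<https://api.github.com/repositories/1/pulls?page=9>; rel="last"')
m = re.search(r'<([^>]+)>; rel="next"', link)
assert m.group(1) == 'https://api.github.com/repositories/1/pulls?page=2'

On the final page GitHub omits the rel="next" entry from the Link header, so the search returns None and the while loop in get_prs_from_github exits.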