#!/usr/bin/env python

# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Read historical samples of merge queue and plot them."""

from __future__ import division

import collections
import cStringIO
import datetime
import gzip
import os
import pprint
import subprocess
import sys
import time
import traceback

# pylint: disable=import-error,wrong-import-position
try:
    import matplotlib
    matplotlib.use('Agg')  # For savefig

    import matplotlib.dates as mdates
    import matplotlib.patches as mpatches
    import matplotlib.pyplot as plt

    import numpy
except ImportError:
    # TODO(fejta): figure out how to add matplotlib and numpy to the bazel
    # workspace. Until then hack around this for unit testing.
    # pylint: disable=invalid-name
    numpy = mdates = mpatches = plt = NotImplementedError
# pylint: enable=wrong-import-position,import-error

DAYS = 21  # Graph this many days of history.

def mean(*a):
    """Calculate the mean for items."""
    return numpy.mean(*a)  # pylint: disable=no-value-for-parameter

def parse_line(
        date, timenow, online, pulls, queue,
        run, blocked, merge_count=0):  # merge_count may be missing
    """Parse a sq/history.txt line."""
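    # Fields are space-separated; an illustrative sample (values made up):
    #   2016-07-01 15:32:01.24 True 150 12 1000 False 27
    # i.e. date time online pulls queue run blocked [merge_count]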
    if '.' not in timenow:
        timenow = '%s.0' % timenow
    return (
        datetime.datetime.strptime(
            '%s %s' % (date, timenow),
            '%Y-%m-%d %H:%M:%S.%f'),
        online == 'True',  # Merge queue is down/initializing
        int(pulls),  # Number of open PRs
        int(queue),  # PRs in the queue
        int(run),  # Totally worthless
        blocked == 'True',  # Cannot merge
        int(merge_count),  # Number of merges
    )

def fresh_color(tick):
    """Return pyplot color for freshness of data."""
    if datetime.datetime.utcnow() - tick < datetime.timedelta(hours=1):
        return 'k'  # black
    return 'r'  # red: sample is more than an hour old


def merge_color(rate):
    """Return pyplot color for merge rate."""
    if rate < 15:
        return 'r'
    if rate < 30:
        return 'y'
    return 'g'


def backlog_color(backlog):
    """Return pyplot color for queue backlog."""
    if backlog < 5:
        return 'g'
    if backlog > 24:
        return 'r'
    return 'y'


def happy_color(health):
    """Return pyplot color for health percentage."""
    if health > 0.8:
        return 'g'
    if health > 0.6:
        return 'y'
    return 'r'


def depth_color(depth):
    """Return pyplot color for the queue depth."""
    if depth < 20:
        return 'g'
    if depth < 40:
        return 'y'
    return 'r'


def format_timedelta(delta):
    """Return XdYhZm string representing timedelta."""
    return '%dd%dh%dm' % (
        delta.days, delta.seconds / 3600, (delta.seconds % 3600) / 60)


class Sampler(object):  # pylint: disable=too-few-public-methods
    """Track mean and total for a given window of items."""
    mean = 0
    total = 0

    def __init__(self, maxlen=60*24):  # Default window: one day of minute samples.
        self.maxlen = maxlen
        self.samples = collections.deque()

    def __iadd__(self, sample):
        self.append(sample)
        return self

    def append(self, sample):
        """Append a sample, updating total and mean, dropping old samples."""
        self.samples.append(sample)
        self.total += sample
        while len(self.samples) > self.maxlen:
            self.total -= self.samples.popleft()
        self.mean = float(self.total) / len(self.samples)


class Results(object):  # pylint: disable=too-few-public-methods,too-many-instance-attributes
    """Results processed from sq/history.txt"""
    def __init__(self):
        self.dts = []
        self.prs = []
        self.queued = []
        self.queue_avg = []
        self.happiness = {  # Percentage of the last N days the queue was unblocked
            1: [],
            14: [],
        }
        self.active_merge_rate = {  # Merge rate when queue is active
            1: [],
            14: [],
        }
        self.merge_rate = {  # Merge rate including when queue is empty
            1: [],
            14: [],
        }
        self.merges = []
        self.backlog = {  # Queue time in hours during the past N days
            1: [],
            14: [],
        }

        self.blocked_intervals = []
        self.offline_intervals = []

    def append(self, tick, did_merge, pulls, queue,
               real_merges, active_merges, happy_moments):
        """Append a sample of results.

        Args:
            tick: datetime of this sample
            did_merge: number of prs merged
            pulls: number of open prs
            queue: number of approved prs waiting for merge
            real_merges: merge rate over various time periods
            active_merges: merge rate when queue is active (full or merging)
            happy_moments: window of when the queue has been unblocked.
        """
        # pylint: disable=too-many-locals
        # Make them steps instead of slopes.
        if self.dts:
            self.dts.append(tick)

            # Append the previous value at the current time
            # which makes all changes move at right angles.
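            # e.g. a queue depth change 5 -> 9 between t0 and t1 plots as
            # [5, 5, 9] at [t0, t1, t1]: a horizontal run then a vertical
            # jump, rather than a diagonal ramp.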
            for happy in self.happiness.values():
                happy.append(happy[-1])
            self.merges.append(did_merge)  # ??? (current value, unlike the other series)
            self.prs.append(self.prs[-1])
            self.queued.append(self.queued[-1])
            self.queue_avg.append(self.queue_avg[-1])
            for val in self.merge_rate.values():
                val.append(val[-1])
            for val in self.active_merge_rate.values():
                val.append(val[-1])
        self.dts.append(tick)
        for days, happy in self.happiness.items():
            happy.append(happy_moments[days].mean)
        self.merges.append(did_merge)
        self.prs.append(pulls)
        self.queued.append(queue)
        weeks2 = 14*24*60  # Two weeks of minute samples.
        avg = mean(self.queued[-weeks2:])
        self.queue_avg.append(avg)
        for days in self.merge_rate:
            self.merge_rate[days].append(real_merges[days].total / float(days))
        for days in self.active_merge_rate:
            self.active_merge_rate[days].append(active_merges[days].total / float(days))
        for dur, items in self.backlog.items():
            dur = 60*24*dur
            if items:
                items.append(items[-1])
            dur_merges = sum(self.merges[-dur:]) * 24.0
            if dur_merges:
                items.append(sum(self.queued[-dur:]) / dur_merges)
            elif items:
                items.append(items[-1])
            else:
                items.append(0)


def output(history_lines, results):  # pylint: disable=too-many-locals,too-many-branches
    """Read historical data and return processed info."""
    real_merges = {
        1: Sampler(),
        14: Sampler(14*60*24),
    }
    active_merges = {
        1: Sampler(),
        14: Sampler(14*60*24),
    }
    happy_moments = {d: Sampler(d*60*24) for d in results.happiness}

    tick = None
    last_merge = 0  # Number of merges last sample, resets on queue restart
    start_blocked = None
    start_offline = None

    for line in history_lines:
        try:
            tick, online, pulls, queue, dummy, blocked, merged = parse_line(
                *line.strip().split(' '))
        except TypeError:  # line does not fit expected criteria
            continue
        if tick < datetime.datetime.now() - datetime.timedelta(days=DAYS+14):
            continue
        if not pulls and not queue and not merged:  # Bad sample
            continue

        if merged >= last_merge:
            did_merge = merged - last_merge
        elif online:  # Restarts reset the number to 0
            did_merge = merged
        else:
            did_merge = 0

        last_merge = merged
        for moments in happy_moments.values():
            moments.append(int(bool(online and not blocked)))

        for val in real_merges.values():
            val += did_merge
        if queue or did_merge:
            for val in active_merges.values():
                val += did_merge

        if not start_offline and not online:
            start_offline = tick
        if start_offline and online:
            results.offline_intervals.append((start_offline, tick))
            start_offline = None

        if not online:  # Skip offline entries
            continue

        results.append(
            tick, did_merge, pulls, queue, real_merges, active_merges, happy_moments)

        if not start_blocked and blocked:
            start_blocked = tick
        if start_blocked and not blocked:
            results.blocked_intervals.append((start_blocked, tick))
            start_blocked = None
    if tick and not online:
        tick = datetime.datetime.utcnow()
        results.append(
            tick, 0, pulls, queue, real_merges, active_merges, happy_moments)
    if start_blocked:
        results.blocked_intervals.append((start_blocked, tick))
    if start_offline:
        results.offline_intervals.append((start_offline, tick))
    return results

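# Sketch of driving output() directly (hypothetical values; output() skips
# lines older than DAYS+14 days, so real timestamps must be recent):
#   results = Results()
#   output(['2024-01-01 00:00:00.0 True 150 12 1000 False 27'], results)
#   results.prs[-1]     # -> 150
#   results.queued[-1]  # -> 12
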
def render_backlog(results, ax_backlog):
    """Render how long items spend in the queue."""
    dts = results.dts
    backlog = results.backlog
    ax_backlog.yaxis.tick_right()
    cur = backlog[1][-1]
    color = backlog_color(cur)
    p_day, = ax_backlog.plot(dts, backlog[1], '%s-' % color)
    p_week, = ax_backlog.plot(dts, backlog[14], 'k:')
    if max(backlog[1]) > 100 or max(backlog[14]) > 100:
        ax_backlog.set_ylim([0, 100])  # Clamp the axis when outliers appear.
    ax_backlog.set_ylabel('Backlog')
    ax_backlog.legend(
        [p_day, p_week],
        ['1d avg: %.1f hr wait' % cur, '14d avg: %.1f hr wait' % backlog[14][-1]],
        'lower left',
        fontsize='x-small',
    )

def render_merges(results, ax_merged):
    """Render information about the merge rate."""
    dts = results.dts
    ax_merged.yaxis.tick_right()
    merge_rate = results.merge_rate
    color = merge_color(results.active_merge_rate[1][-1])
    p_day, = ax_merged.plot(dts, merge_rate[1], '%s-' % color)
    p_active, = ax_merged.plot(dts, results.active_merge_rate[1], '%s:' % color)
    p_week, = ax_merged.plot(dts, merge_rate[14], 'k:')
    ax_merged.set_ylabel('Merge rate')
    ax_merged.legend(
        [p_active, p_day, p_week],
        ['active rate: %.1f PRs/day' % results.active_merge_rate[1][-1],
         'real rate: %.1f PRs/day' % merge_rate[1][-1],
         'real 14d avg: %.1f PRs/day' % merge_rate[14][-1]],
        'lower left',
        fontsize='x-small',
    )


def render_health(results, ax_health):
    """Render the percentage of time the queue is healthy/online."""
    # pylint: disable=too-many-locals
    dts = results.dts
    happiness = results.happiness
    ax_health.yaxis.tick_right()

    health_color = '%s-' % happy_color(happiness[1][-1])
    p_1dhealth, = ax_health.plot(dts, happiness[1], health_color)
    p_14dhealth, = ax_health.plot(dts, happiness[14], 'k:')
    cur = 100 * happiness[1][-1]
    cur14 = 100 * happiness[14][-1]
    ax_health.set_ylabel('Unblocked %')

    ax_health.set_ylim([0.0, 1.0])
    ax_health.set_xlim(
        left=datetime.datetime.now() - datetime.timedelta(days=DAYS))

    for start, end in results.blocked_intervals:
        ax_health.axvspan(start, end, alpha=0.2, color='brown', linewidth=0)
    for start, end in results.offline_intervals:
        ax_health.axvspan(start, end, alpha=0.2, color='black', linewidth=0)

    patches = [
        p_1dhealth,
        p_14dhealth,
        mpatches.Patch(color='brown', label='blocked', alpha=0.2),
        mpatches.Patch(color='black', label='offline', alpha=0.2),
    ]

    ax_health.legend(
        patches,
        ['1d avg: %.1f%%' % cur, '14d avg: %.1f%%' % cur14, 'blocked', 'offline'],
        'lower left',
        fontsize='x-small',
    )


def render_queue(results, ax_open):
    """Render the queue graph (open prs, queued, prs)."""
    dts = results.dts
    prs = results.prs
    queued = results.queued
    queue_avg = results.queue_avg
    ax_queued = ax_open.twinx()
    p_open, = ax_open.plot(dts, prs, 'b-')
    color_depth = depth_color(queued[-1])
    p_queue, = ax_queued.plot(dts, queued, color_depth)
    p_14dqueue, = ax_queued.plot(dts, queue_avg, 'k:')
    ax_queued.legend(
        [p_open, p_queue, p_14dqueue],
        [
            'open: %d PRs' % prs[-1],
            'approved: %d PRs' % queued[-1],
            '14d avg: %.1f PRs' % queue_avg[-1],
        ],
        'lower left',
        fontsize='x-small',
    )

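# render() below stacks the four panels on one shared date axis;
# render_queue() uses twinx() so open-PR counts and queue depth keep
# independent y scales.
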
def render(results, out_file):
    """Render four graphs to out_file from results."""
    fig, (ax_backlog, ax_merges, ax_open, ax_health) = plt.subplots(
        4, sharex=True, figsize=(16, 10), dpi=100)

    fig.autofmt_xdate()
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
    plt.gca().xaxis.set_major_locator(mdates.DayLocator())
    if results.dts:
        render_queue(results, ax_open)
        render_merges(results, ax_merges)
        render_backlog(results, ax_backlog)
        render_health(results, ax_health)
        fig.text(
            0.1, 0.00,
            'image: %s, sample: %s' % (
                datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M'),
                results.dts[-1].strftime('%Y-%m-%d %H:%M'),
            ),
            horizontalalignment='left',
            fontsize='x-small',
            color=fresh_color(results.dts[-1]),
        )

    plt.savefig(out_file, bbox_inches='tight', format='svg')
    plt.close()


def render_forever(history_uri, img_uri, service_account=None):
    """Download results from history_uri, render to svg and save to img_uri."""
    if service_account:
        print >>sys.stderr, 'Activating service account using: %s' % (
            service_account)
        subprocess.check_call([
            'gcloud',
            'auth',
            'activate-service-account',
            '--key-file=%s' % service_account,
        ])
    buf = cStringIO.StringIO()
    while True:
        print >>sys.stderr, 'Truncate render buffer'
        buf.seek(0)
        buf.truncate()
        print >>sys.stderr, 'Cat latest results from %s...' % history_uri
        try:
            history = subprocess.check_output(
                ['gsutil', '-q', 'cat', history_uri])
        except subprocess.CalledProcessError:
            traceback.print_exc()
            time.sleep(10)
            continue

        print >>sys.stderr, 'Render results to buffer...'
        with gzip.GzipFile(
                os.path.basename(img_uri), mode='wb', fileobj=buf) as compressed:
            results = Results()
            output(history.split('\n')[-60*24*DAYS:], results)  # Last 21 days
            render(results, compressed)

        print >>sys.stderr, 'Copy buffer to %s...' % img_uri
        proc = subprocess.Popen(
            ['gsutil', '-q',
             '-h', 'Content-Type:image/svg+xml',
             '-h', 'Cache-Control:public, max-age=%d' % (
                 60 if service_account else 5),
             '-h', 'Content-Encoding:gzip',  # GCS decompresses if necessary
             'cp', '-a', 'public-read', '-', img_uri],
            stdin=subprocess.PIPE)
        proc.communicate(buf.getvalue())
        code = proc.wait()
        if code:
            print >>sys.stderr, 'Failed to copy rendering to %s: %d' % (
                img_uri, code)
        time.sleep(60)


if __name__ == '__main__':
    # Log all arguments.
    pprint.PrettyPrinter(stream=sys.stderr).pprint(sys.argv)

    render_forever(*sys.argv[1:])  # pylint: disable=no-value-for-parameter
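# Typical invocation (URIs are illustrative):
#   graph.py gs://<bucket>/sq/history.txt gs://<bucket>/queue-health.svg [key.json]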