github.com/shashidharatd/test-infra@v0.0.0-20171006011030-71304e1ca560/queue_health/graph/graph.py (about) 1 #!/usr/bin/env python 2 3 # Copyright 2016 The Kubernetes Authors. 4 # 5 # Licensed under the Apache License, Version 2.0 (the "License"); 6 # you may not use this file except in compliance with the License. 7 # You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 17 """Read historical samples of merge queue and plot them.""" 18 19 from __future__ import division 20 21 import collections 22 import cStringIO 23 import datetime 24 import gzip 25 import os 26 import pprint 27 import subprocess 28 import sys 29 import time 30 import traceback 31 32 # pylint: disable=import-error,wrong-import-position 33 import matplotlib 34 matplotlib.use('Agg') # For savefig 35 36 import matplotlib.dates as mdates 37 import matplotlib.patches as mpatches 38 import matplotlib.pyplot as plt 39 40 import numpy 41 # pylint: enable=wrong-import-position,import-error 42 43 DAYS = 21 # Graph this many days of history. 
def mean(*a):
    """Calculate the mean for items."""
    return numpy.mean(*a)  # pylint: disable=no-value-for-parameter


def parse_line(
        date, timenow, online, pulls, queue,
        run, blocked, merge_count=0):  # merge_count may be missing
    """Parse a whitespace-separated sq/history.txt line into typed fields.

    Args:
        date: 'YYYY-MM-DD' date string.
        timenow: 'HH:MM:SS.ffffff' time string (UTC).
        online: 'True' when the merge queue is up.
        pulls: number of open PRs, as a string.
        queue: number of PRs in the queue, as a string.
        run: unused legacy counter from the history file.
        blocked: 'True' when the queue cannot merge.
        merge_count: cumulative merge count (absent in very old lines).

    Returns:
        Tuple of (datetime, online, pulls, queue, run, blocked, merge_count).
    """
    return (
        datetime.datetime.strptime(
            '{} {}'.format(date, timenow),
            '%Y-%m-%d %H:%M:%S.%f'),
        online == 'True',  # Merge queue is down/initializing
        int(pulls),  # Number of open PRs
        int(queue),  # PRs in the queue
        int(run),  # Totally worthless
        blocked == 'True',  # Cannot merge
        int(merge_count),  # Number of merges
    )


def fresh_color(tick):
    """Return pyplot color for freshness of data: black if <1h old, else red."""
    # Ticks are UTC, so compare against utcnow().
    if datetime.datetime.utcnow() - tick < datetime.timedelta(hours=1):
        return 'k'  # black
    return 'r'


def merge_color(rate):
    """Return pyplot color for merge rate: red < 15 <= yellow < 30 <= green."""
    if rate < 15:
        return 'r'
    if rate < 30:
        return 'y'
    return 'g'


def backlog_color(backlog):
    """Return pyplot color for queue backlog: green < 5, red > 24, else yellow."""
    if backlog < 5:
        return 'g'
    if backlog > 24:
        return 'r'
    return 'y'


def happy_color(health):
    """Return pyplot color for health fraction: green > 0.8, yellow > 0.6, else red."""
    if health > 0.8:
        return 'g'
    if health > 0.6:
        return 'y'
    return 'r'


def depth_color(depth):
    """Return pyplot color for the queue depth: green < 20 <= yellow < 40 <= red."""
    if depth < 20:
        return 'g'
    if depth < 40:
        return 'y'
    return 'r'


def format_timedelta(delta):
    """Return XdYhZm string representing timedelta."""
    # Use floor division: true division is imported at the top of this
    # module, so '/' would yield floats here.
    return '%dd%dh%dm' % (
        delta.days, delta.seconds // 3600, (delta.seconds % 3600) // 60)


class Sampler(object):  # pylint: disable=too-few-public-methods
    """Track mean and total for a sliding window of numeric samples."""

    def __init__(self, maxlen=60*24):
        # maxlen defaults to one day of minute-resolution samples.
        self.maxlen = maxlen
        self.samples = collections.deque()
        # Keep running aggregates as instance state (they were class
        # attributes, which reads like shared mutable state).
        self.total = 0
        self.mean = 0

    def __iadd__(self, sample):
        self.append(sample)
        return self

    def append(self, sample):
        """Append a sample, updating total and mean, dropping old samples."""
        self.samples.append(sample)
        self.total += sample
        while len(self.samples) > self.maxlen:
            self.total -= self.samples.popleft()
        self.mean = float(self.total) / len(self.samples)


class Results(object):  # pylint: disable=too-few-public-methods,too-many-instance-attributes
    """Time series processed from sq/history.txt, ready for plotting."""

    def __init__(self):
        self.dts = []  # Sample timestamps (UTC)
        self.prs = []  # Open PRs at each tick
        self.queued = []  # Approved PRs waiting for merge at each tick
        self.queue_avg = []  # 14-day rolling average of queued
        self.happiness = {  # Percentage of last N days queue was unblocked
            1: [],
            14: [],
        }
        self.active_merge_rate = {  # Merge rate when queue is active
            1: [],
            14: [],
        }
        self.merge_rate = {  # Merge rate including when queue is empty
            1: [],
            14: [],
        }
        self.merges = []  # Merges observed at each tick
        self.backlog = {  # Queue time in hours during the past N days
            1: [],
            14: [],
        }

        self.blocked_intervals = []  # (start, end) spans when merging blocked
        self.offline_intervals = []  # (start, end) spans when queue offline

    def append(self, tick, did_merge, pulls, queue,
               real_merges, active_merges, happy_moments):
        """Append a sample of results.

        Args:
            tick: datetime of this sample
            did_merge: number of prs merged
            pulls: number of open prs
            queue: number of approved prs waiting for merge
            real_merges: merge rate over various time periods
            active_merges: merge rate when queue is active (full or merging)
            happy_moments: window of when the queue has been unblocked.
        """
        # pylint: disable=too-many-locals
        # Make them steps instead of slopes.
        if self.dts:
            self.dts.append(tick)

            # Append the previous value at the current time
            # which makes all changes move at right angles.
            for happy in self.happiness.values():
                happy.append(happy[-1])
            self.merges.append(did_merge)  # ???
            self.prs.append(self.prs[-1])
            self.queued.append(self.queued[-1])
            self.queue_avg.append(self.queue_avg[-1])
            for val in self.merge_rate.values():
                val.append(val[-1])
            for val in self.active_merge_rate.values():
                val.append(val[-1])
        # Now append the actual values for this tick.
        self.dts.append(tick)
        for days, happy in self.happiness.items():
            happy.append(happy_moments[days].mean)
        self.merges.append(did_merge)
        self.prs.append(pulls)
        self.queued.append(queue)
        weeks2 = 14*24*60  # 14 days of minute samples
        avg = mean(self.queued[-weeks2:])
        self.queue_avg.append(avg)
        for days in self.merge_rate:
            self.merge_rate[days].append(real_merges[days].total / float(days))
        for days in self.active_merge_rate:
            self.active_merge_rate[days].append(
                active_merges[days].total / float(days))
        for dur, items in self.backlog.items():
            dur = 60*24*dur  # Convert days to minutes of samples.
            if items:
                items.append(items[-1])  # Step, as above.
            # Scale merges/minute-window to hours of wait per queued PR.
            dur_merges = sum(self.merges[-dur:]) * 24.0
            if dur_merges:
                items.append(sum(self.queued[-dur:]) / dur_merges)
            elif items:
                items.append(items[-1])  # No merges: hold the previous value.
            else:
                items.append(0)


def output(history_lines, results):  # pylint: disable=too-many-locals,too-many-branches
    """Read historical data, fill results with processed info and return it.

    Args:
        history_lines: iterable of raw sq/history.txt lines.
        results: a Results instance to populate.

    Returns:
        The populated results object.
    """
    real_merges = {
        1: Sampler(),
        14: Sampler(14*60*24),
    }
    active_merges = {
        1: Sampler(),
        14: Sampler(14*60*24),
    }
    happy_moments = {d: Sampler(d*60*24) for d in results.happiness}

    tick = None
    last_merge = 0  # Number of merges last sample, resets on queue restart
    start_blocked = None
    start_offline = None

    for line in history_lines:
        try:
            tick, online, pulls, queue, dummy, blocked, merged = parse_line(
                *line.strip().split(' '))
        except TypeError:  # line does not fit expected criteria
            continue
        # History ticks are UTC (see fresh_color and the utcnow fallback
        # below), so filter against utcnow() rather than local now().
        if tick < datetime.datetime.utcnow() - datetime.timedelta(days=DAYS+14):
            continue
        if not pulls and not queue and not merged:  # Bad sample
            continue

        if merged >= last_merge:
            did_merge = merged - last_merge
        elif online:  # Restarts reset the number to 0
            did_merge = merged
        else:
            did_merge = 0

        last_merge = merged
        for moments in happy_moments.values():
            moments.append(int(bool(online and not blocked)))

        for val in real_merges.values():
            val += did_merge
        if queue or did_merge:  # Only sample when the queue had work to do.
            for val in active_merges.values():
                val += did_merge

        if not start_offline and not online:
            start_offline = tick
        if start_offline and online:
            results.offline_intervals.append((start_offline, tick))
            start_offline = None

        if not online:  # Skip offline entries
            continue

        results.append(
            tick, did_merge, pulls, queue, real_merges, active_merges,
            happy_moments)

        if not start_blocked and blocked:
            start_blocked = tick
        if start_blocked and not blocked:
            results.blocked_intervals.append((start_blocked, tick))
            start_blocked = None
    if tick and not online:
        # Extend a trailing offline sample up to the present.
        tick = datetime.datetime.utcnow()
        results.append(
            tick, 0, pulls, queue, real_merges, active_merges, happy_moments)
    if start_blocked:
        results.blocked_intervals.append((start_blocked, tick))
    if start_offline:
        results.offline_intervals.append((start_offline, tick))
    return results


def render_backlog(results, ax_backlog):
    """Render how long items spend in the queue."""
    dts = results.dts
    backlog = results.backlog
    ax_backlog.yaxis.tick_right()
    cur = backlog[1][-1]
    color = backlog_color(cur)
    p_day, = ax_backlog.plot(dts, backlog[1], '%s-' % color)
    p_week, = ax_backlog.plot(dts, backlog[14], 'k:')
    # Clamp the axis when outliers would otherwise flatten the graph.
    if max(backlog[1]) > 100 or max(backlog[14]) > 100:
        ax_backlog.set_ylim([0, 100])
    ax_backlog.set_ylabel('Backlog')
    ax_backlog.legend(
        [p_day, p_week],
        ['1d avg: %.1f hr wait' % cur,
         '14d avg: %.1f hr wait' % backlog[14][-1]],
        'lower left',
        fontsize='x-small',
    )


def render_merges(results, ax_merged):
    """Render information about the merge rate."""
    dts = results.dts
    ax_merged.yaxis.tick_right()
    merge_rate = results.merge_rate
    # Color by the current active rate (rate while the queue has work).
    color = merge_color(results.active_merge_rate[1][-1])
    p_day, = ax_merged.plot(dts, merge_rate[1], '%s-' % color)
    p_active, = ax_merged.plot(dts, results.active_merge_rate[1], '%s:' % color)
    p_week, = ax_merged.plot(dts, merge_rate[14], 'k:')
    ax_merged.set_ylabel('Merge rate')
    ax_merged.legend(
        [p_active, p_day, p_week],
        ['active rate: %.1f PRs/day' % results.active_merge_rate[1][-1],
         'real rate: %.1f PRs/day' % merge_rate[1][-1],
         'real 14d avg: %.1f PRs/day' % merge_rate[14][-1]],
        'lower left',
        fontsize='x-small',
    )


def render_health(results, ax_health):
    """Render the percentage of time the queue is healthy/online."""
    # pylint: disable=too-many-locals
    dts = results.dts
    happiness = results.happiness
    ax_health.yaxis.tick_right()

    health_color = '%s-' % happy_color(happiness[1][-1])
    p_1dhealth, = ax_health.plot(dts, happiness[1], health_color)
    p_14dhealth, = ax_health.plot(dts, happiness[14], 'k:')
    cur = 100 * happiness[1][-1]
    cur14 = 100 * happiness[14][-1]
    ax_health.set_ylabel('Unblocked %')

    ax_health.set_ylim([0.0, 1.0])
    # All axes share x, so this sets the window for every subplot.
    # Ticks are UTC, so anchor the window at utcnow(), not local now().
    ax_health.set_xlim(
        left=datetime.datetime.utcnow() - datetime.timedelta(days=DAYS))

    for start, end in results.blocked_intervals:
        ax_health.axvspan(start, end, alpha=0.2, color='brown', linewidth=0)
    for start, end in results.offline_intervals:
        ax_health.axvspan(start, end, alpha=0.2, color='black', linewidth=0)

    patches = [
        p_1dhealth,
        p_14dhealth,
        mpatches.Patch(color='brown', label='blocked', alpha=0.2),
        mpatches.Patch(color='black', label='offline', alpha=0.2),
    ]

    ax_health.legend(
        patches,
        ['1d avg: %.1f%%' % cur, '14d avg: %.1f%%' % cur14,
         'blocked', 'offline'],
        'lower left',
        fontsize='x-small',
    )
def render_queue(results, ax_open):
    """Render the queue graph (open prs, queued, prs)."""
    dts = results.dts
    prs = results.prs
    queued = results.queued
    queue_avg = results.queue_avg
    # Queue depth gets its own right-hand y-axis, sharing x with open PRs.
    ax_queued = ax_open.twinx()
    p_open, = ax_open.plot(dts, prs, 'b-')
    # Color the queue-depth line by the current depth (green/yellow/red).
    color_depth = depth_color(queued[-1])
    p_queue, = ax_queued.plot(dts, queued, color_depth)
    p_14dqueue, = ax_queued.plot(dts, queue_avg, 'k:')
    ax_queued.legend(
        [p_open, p_queue, p_14dqueue],
        [
            'open: %d PRs' % prs[-1],
            'approved: %d PRs' % queued[-1],
            '14d avg: %.1f PRs' % queue_avg[-1],
        ],
        'lower left',
        fontsize='x-small',
    )


def render(results, out_file):
    """Render three graphs to outfile from results."""
    # NOTE(review): four stacked axes are created although the docstring
    # says three; backlog/merges/open+queued/health from top to bottom.
    fig, (ax_backlog, ax_merges, ax_open, ax_health) = plt.subplots(
        4, sharex=True, figsize=(16, 10), dpi=100)

    fig.autofmt_xdate()
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
    plt.gca().xaxis.set_major_locator(mdates.DayLocator())
    # Only draw when there is data; the renderers index results.dts[-1] etc.
    if results.dts:
        render_queue(results, ax_open)
        render_merges(results, ax_merges)
        render_backlog(results, ax_backlog)
        render_health(results, ax_health)
        # Footer stamps render time and last-sample time; fresh_color turns
        # the text red when the newest sample is over an hour old.
        fig.text(
            0.1, 0.00,
            'image: %s, sample: %s' % (
                datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M'),
                results.dts[-1].strftime('%Y-%m-%d %H:%M'),
            ),
            horizontalalignment='left',
            fontsize='x-small',
            color=fresh_color(results.dts[-1]),
        )

    plt.savefig(out_file, bbox_inches='tight', format='svg')
    plt.close()


def render_forever(history_uri, img_uri, service_account=None):
    """Download results from history_uri, render to svg and save to img_uri.

    Loops forever: each minute it cats the history file from GCS, renders
    a gzipped svg into an in-memory buffer, and uploads it with gsutil.

    Args:
        history_uri: gs:// path of sq/history.txt to read.
        img_uri: gs:// path to write the rendered svg to.
        service_account: optional key file used to authenticate gcloud/gsutil.
    """
    if service_account:
        print >>sys.stderr, 'Activating service account using: %s' % (
            service_account)
        # Authenticate so the gsutil calls below can read/write the bucket.
        subprocess.check_call([
            'gcloud',
            'auth',
            'activate-service-account',
            '--key-file=%s' % service_account,
        ])
    # One reusable in-memory buffer for the gzipped svg across iterations.
    buf = cStringIO.StringIO()
    while True:
        print >>sys.stderr, 'Truncate render buffer'
        buf.seek(0)
        buf.truncate()
        print >>sys.stderr, 'Cat latest results from %s...' % history_uri
        try:
            history = subprocess.check_output(
                ['gsutil', '-q', 'cat', history_uri])
        except subprocess.CalledProcessError:
            # Transient fetch failure: log, back off briefly, retry.
            traceback.print_exc()
            time.sleep(10)
            continue

        print >>sys.stderr, 'Render results to buffer...'
        # Gzip-wrap the buffer so the uploaded svg is compressed; the
        # Content-Encoding header below tells GCS how to serve it.
        with gzip.GzipFile(
            os.path.basename(img_uri), mode='wb', fileobj=buf) as compressed:
            results = Results()
            output(history.split('\n')[-60*24*DAYS:], results)  # Last 21 days
            render(results, compressed)

        print >>sys.stderr, 'Copy buffer to %s...' % img_uri
        # Stream the buffer to gsutil's stdin ('-' source) as a public svg.
        proc = subprocess.Popen(
            ['gsutil', '-q',
             '-h', 'Content-Type:image/svg+xml',
             '-h', 'Cache-Control:public, max-age=%d' % (
                 60 if service_account else 5),
             '-h', 'Content-Encoding:gzip',  # GCS decompresses if necessary
             'cp', '-a', 'public-read', '-', img_uri],
            stdin=subprocess.PIPE)
        proc.communicate(buf.getvalue())
        code = proc.wait()
        if code:
            # Upload failure is logged but not fatal; retry next cycle.
            print >>sys.stderr, 'Failed to copy rendering to %s: %d' % (
                img_uri, code)
        time.sleep(60)


if __name__ == '__main__':
    # log all arguments.
    pprint.PrettyPrinter(stream=sys.stderr).pprint(sys.argv)

    render_forever(*sys.argv[1:])  # pylint: disable=no-value-for-parameter