#!/usr/bin/env python

# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Read historical samples of merge queue and plot them."""

from __future__ import division

import collections
import cStringIO
import datetime
import gzip
import os
import pprint
import subprocess
import sys
import time
import traceback

# pylint: disable=import-error,wrong-import-position
try:
    import matplotlib
    matplotlib.use('Agg')  # For savefig

    import matplotlib.dates as mdates
    import matplotlib.patches as mpatches
    import matplotlib.pyplot as plt

    import numpy
except ImportError:
    # TODO(fejta): figure out how to add matplotlib and numpy to the bazel
    # workspace. Until then hack around this for unit testing.
    # pylint: disable=invalid-name
    numpy = mdates = mpatches = plt = NotImplementedError
# pylint: enable=wrong-import-position,import-error

DAYS = 21  # Graph this many days of history.

def mean(*a):
    """Calculate the mean for items."""
    return numpy.mean(*a)  # pylint: disable=no-value-for-parameter

def parse_line(
        date, timenow, online, pulls, queue,
        run, blocked, merge_count=0):  # merge_count may be missing
    """Parse a sq/history.txt line."""
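    # Fields are space-separated; an illustrative sample (values made up):
    #   2016-07-01 15:32:01.24 True 150 12 1000 False 27
    # i.e. date time online pulls queue run blocked [merge_count]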
    if '.' not in timenow:
        timenow = '%s.0' % timenow
    return (
        datetime.datetime.strptime(
            '%s %s' % (date, timenow),
            '%Y-%m-%d %H:%M:%S.%f'),
        online == 'True',  # Merge queue is down/initializing
        int(pulls),  # Number of open PRs
        int(queue),  # PRs in the queue
        int(run),  # Totally worthless
        blocked == 'True',  # Cannot merge
        int(merge_count),  # Number of merges
    )

def fresh_color(tick):
    """Return pyplot color for freshness of data."""
    if datetime.datetime.utcnow() - tick < datetime.timedelta(hours=1):
        return 'k'  # black
    return 'r'  # red: sample is more than an hour old


def merge_color(rate):
    """Return pyplot color for merge rate."""
    if rate < 15:
        return 'r'
    if rate < 30:
        return 'y'
    return 'g'


def backlog_color(backlog):
    """Return pyplot color for queue backlog."""
    if backlog < 5:
        return 'g'
    if backlog > 24:
        return 'r'
    return 'y'


def happy_color(health):
    """Return pyplot color for health percentage."""
    if health > 0.8:
        return 'g'
    if health > 0.6:
        return 'y'
    return 'r'


def depth_color(depth):
    """Return pyplot color for the queue depth."""
    if depth < 20:
        return 'g'
    if depth < 40:
        return 'y'
    return 'r'


def format_timedelta(delta):
    """Return XdYhZm string representing timedelta."""
    return '%dd%dh%dm' % (
        delta.days, delta.seconds / 3600, (delta.seconds % 3600) / 60)


class Sampler(object):  # pylint: disable=too-few-public-methods
    """Track mean and total for a given window of items."""
    mean = 0
    total = 0

    def __init__(self, maxlen=60*24):  # Default window: one day of minute samples.
        self.maxlen = maxlen
        self.samples = collections.deque()

    def __iadd__(self, sample):
        self.append(sample)
        return self

    def append(self, sample):
        """Append a sample, updating total and mean, dropping old samples."""
        self.samples.append(sample)
        self.total += sample
        while len(self.samples) > self.maxlen:
            self.total -= self.samples.popleft()
        self.mean = float(self.total) / len(self.samples)


class Results(object):  # pylint: disable=too-few-public-methods,too-many-instance-attributes
    """Results processed from sq/history.txt"""
    def __init__(self):
        self.dts = []
        self.prs = []
        self.queued = []
        self.queue_avg = []
        self.happiness = {  # Percentage of the last N days the queue was unblocked
            1: [],
            14: [],
        }
        self.active_merge_rate = {  # Merge rate when queue is active
            1: [],
            14: [],
        }
        self.merge_rate = {  # Merge rate including when queue is empty
            1: [],
            14: [],
        }
        self.merges = []
        self.backlog = {  # Queue time in hours during the past N days
            1: [],
            14: [],
        }

        self.blocked_intervals = []
        self.offline_intervals = []

    def append(self, tick, did_merge, pulls, queue,
               real_merges, active_merges, happy_moments):
        """Append a sample of results.

        Args:
            tick: datetime of this sample
            did_merge: number of prs merged
            pulls: number of open prs
            queue: number of approved prs waiting for merge
            real_merges: merge rate over various time periods
            active_merges: merge rate when queue is active (full or merging)
            happy_moments: window of when the queue has been unblocked.
        """
        # pylint: disable=too-many-locals
        # Make them steps instead of slopes.
        if self.dts:
            self.dts.append(tick)

            # Append the previous value at the current time
            # which makes all changes move at right angles.
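            # e.g. a queue depth change 5 -> 9 between t0 and t1 plots as
            # [5, 5, 9] at [t0, t1, t1]: a horizontal run then a vertical
            # jump, rather than a diagonal ramp.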
            for happy in self.happiness.values():
                happy.append(happy[-1])
            self.merges.append(did_merge)  # ??? (current value, unlike the other series)
            self.prs.append(self.prs[-1])
            self.queued.append(self.queued[-1])
            self.queue_avg.append(self.queue_avg[-1])
            for val in self.merge_rate.values():
                val.append(val[-1])
            for val in self.active_merge_rate.values():
                val.append(val[-1])
        self.dts.append(tick)
        for days, happy in self.happiness.items():
            happy.append(happy_moments[days].mean)
        self.merges.append(did_merge)
        self.prs.append(pulls)
        self.queued.append(queue)
        weeks2 = 14*24*60  # Two weeks of minute samples.
        avg = mean(self.queued[-weeks2:])
        self.queue_avg.append(avg)
        for days in self.merge_rate:
            self.merge_rate[days].append(real_merges[days].total / float(days))
        for days in self.active_merge_rate:
            self.active_merge_rate[days].append(active_merges[days].total / float(days))
        for dur, items in self.backlog.items():
            dur = 60*24*dur
            if items:
                items.append(items[-1])
            dur_merges = sum(self.merges[-dur:]) * 24.0
            if dur_merges:
                items.append(sum(self.queued[-dur:]) / dur_merges)
            elif items:
                items.append(items[-1])
            else:
                items.append(0)


def output(history_lines, results):  # pylint: disable=too-many-locals,too-many-branches
    """Read historical data and return processed info."""
    real_merges = {
        1: Sampler(),
        14: Sampler(14*60*24),
    }
    active_merges = {
        1: Sampler(),
        14: Sampler(14*60*24),
    }
    happy_moments = {d: Sampler(d*60*24) for d in results.happiness}

    tick = None
    last_merge = 0  # Number of merges last sample, resets on queue restart
    start_blocked = None
    start_offline = None

    for line in history_lines:
        try:
            tick, online, pulls, queue, dummy, blocked, merged = parse_line(
                *line.strip().split(' '))
        except TypeError:  # line does not fit expected criteria
            continue
        if tick < datetime.datetime.now() - datetime.timedelta(days=DAYS+14):
            continue
        if not pulls and not queue and not merged:  # Bad sample
            continue

        if merged >= last_merge:
            did_merge = merged - last_merge
        elif online:  # Restarts reset the number to 0
            did_merge = merged
        else:
            did_merge = 0

        last_merge = merged
        for moments in happy_moments.values():
            moments.append(int(bool(online and not blocked)))

        for val in real_merges.values():
            val += did_merge
        if queue or did_merge:
            for val in active_merges.values():
                val += did_merge

        if not start_offline and not online:
            start_offline = tick
        if start_offline and online:
            results.offline_intervals.append((start_offline, tick))
            start_offline = None

        if not online:  # Skip offline entries
            continue

        results.append(
            tick, did_merge, pulls, queue, real_merges, active_merges, happy_moments)

        if not start_blocked and blocked:
            start_blocked = tick
        if start_blocked and not blocked:
            results.blocked_intervals.append((start_blocked, tick))
            start_blocked = None
    if tick and not online:
        tick = datetime.datetime.utcnow()
        results.append(
            tick, 0, pulls, queue, real_merges, active_merges, happy_moments)
    if start_blocked:
        results.blocked_intervals.append((start_blocked, tick))
    if start_offline:
        results.offline_intervals.append((start_offline, tick))
    return results

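# Sketch of driving output() directly (hypothetical values; output() skips
# lines older than DAYS+14 days, so real timestamps must be recent):
#   results = Results()
#   output(['2024-01-01 00:00:00.0 True 150 12 1000 False 27'], results)
#   results.prs[-1]     # -> 150
#   results.queued[-1]  # -> 12
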
def render_backlog(results, ax_backlog):
    """Render how long items spend in the queue."""
    dts = results.dts
    backlog = results.backlog
    ax_backlog.yaxis.tick_right()
    cur = backlog[1][-1]
    color = backlog_color(cur)
    p_day, = ax_backlog.plot(dts, backlog[1], '%s-' % color)
    p_week, = ax_backlog.plot(dts, backlog[14], 'k:')
    if max(backlog[1]) > 100 or max(backlog[14]) > 100:
        ax_backlog.set_ylim([0, 100])  # Clamp the axis when outliers appear.
    ax_backlog.set_ylabel('Backlog')
    ax_backlog.legend(
        [p_day, p_week],
        ['1d avg: %.1f hr wait' % cur, '14d avg: %.1f hr wait' % backlog[14][-1]],
        'lower left',
        fontsize='x-small',
    )

def render_merges(results, ax_merged):
    """Render information about the merge rate."""
    dts = results.dts
    ax_merged.yaxis.tick_right()
    merge_rate = results.merge_rate
    color = merge_color(results.active_merge_rate[1][-1])
    p_day, = ax_merged.plot(dts, merge_rate[1], '%s-' % color)
    p_active, = ax_merged.plot(dts, results.active_merge_rate[1], '%s:' % color)
    p_week, = ax_merged.plot(dts, merge_rate[14], 'k:')
    ax_merged.set_ylabel('Merge rate')
    ax_merged.legend(
        [p_active, p_day, p_week],
        ['active rate: %.1f PRs/day' % results.active_merge_rate[1][-1],
         'real rate: %.1f PRs/day' % merge_rate[1][-1],
         'real 14d avg: %.1f PRs/day' % merge_rate[14][-1]],
        'lower left',
        fontsize='x-small',
    )


def render_health(results, ax_health):
    """Render the percentage of time the queue is healthy/online."""
    # pylint: disable=too-many-locals
    dts = results.dts
    happiness = results.happiness
    ax_health.yaxis.tick_right()

    health_color = '%s-' % happy_color(happiness[1][-1])
    p_1dhealth, = ax_health.plot(dts, happiness[1], health_color)
    p_14dhealth, = ax_health.plot(dts, happiness[14], 'k:')
    cur = 100 * happiness[1][-1]
    cur14 = 100 * happiness[14][-1]
    ax_health.set_ylabel('Unblocked %')

    ax_health.set_ylim([0.0, 1.0])
    ax_health.set_xlim(
        left=datetime.datetime.now() - datetime.timedelta(days=DAYS))

    for start, end in results.blocked_intervals:
        ax_health.axvspan(start, end, alpha=0.2, color='brown', linewidth=0)
    for start, end in results.offline_intervals:
        ax_health.axvspan(start, end, alpha=0.2, color='black', linewidth=0)

    patches = [
        p_1dhealth,
        p_14dhealth,
        mpatches.Patch(color='brown', label='blocked', alpha=0.2),
        mpatches.Patch(color='black', label='offline', alpha=0.2),
    ]

    ax_health.legend(
        patches,
        ['1d avg: %.1f%%' % cur, '14d avg: %.1f%%' % cur14, 'blocked', 'offline'],
        'lower left',
        fontsize='x-small',
    )


def render_queue(results, ax_open):
    """Render the queue graph (open prs, queued, prs)."""
    dts = results.dts
    prs = results.prs
    queued = results.queued
    queue_avg = results.queue_avg
    ax_queued = ax_open.twinx()
    p_open, = ax_open.plot(dts, prs, 'b-')
    color_depth = depth_color(queued[-1])
    p_queue, = ax_queued.plot(dts, queued, color_depth)
    p_14dqueue, = ax_queued.plot(dts, queue_avg, 'k:')
    ax_queued.legend(
        [p_open, p_queue, p_14dqueue],
        [
            'open: %d PRs' % prs[-1],
            'approved: %d PRs' % queued[-1],
            '14d avg: %.1f PRs' % queue_avg[-1],
        ],
        'lower left',
        fontsize='x-small',
    )

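# render() below stacks the four panels on one shared date axis;
# render_queue() uses twinx() so open-PR counts and queue depth keep
# independent y scales.
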
def render(results, out_file):
    """Render four graphs to out_file from results."""
    fig, (ax_backlog, ax_merges, ax_open, ax_health) = plt.subplots(
        4, sharex=True, figsize=(16, 10), dpi=100)

    fig.autofmt_xdate()
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
    plt.gca().xaxis.set_major_locator(mdates.DayLocator())
    if results.dts:
        render_queue(results, ax_open)
        render_merges(results, ax_merges)
        render_backlog(results, ax_backlog)
        render_health(results, ax_health)
        fig.text(
            0.1, 0.00,
            'image: %s, sample: %s' % (
                datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M'),
                results.dts[-1].strftime('%Y-%m-%d %H:%M'),
            ),
            horizontalalignment='left',
            fontsize='x-small',
            color=fresh_color(results.dts[-1]),
        )

    plt.savefig(out_file, bbox_inches='tight', format='svg')
    plt.close()


def render_forever(history_uri, img_uri, service_account=None):
    """Download results from history_uri, render to svg and save to img_uri."""
    if service_account:
        print >>sys.stderr, 'Activating service account using: %s' % (
            service_account)
        subprocess.check_call([
            'gcloud',
            'auth',
            'activate-service-account',
            '--key-file=%s' % service_account,
        ])
    buf = cStringIO.StringIO()
    while True:
        print >>sys.stderr, 'Truncate render buffer'
        buf.seek(0)
        buf.truncate()
        print >>sys.stderr, 'Cat latest results from %s...' % history_uri
        try:
            history = subprocess.check_output(
                ['gsutil', '-q', 'cat', history_uri])
        except subprocess.CalledProcessError:
            traceback.print_exc()
            time.sleep(10)
            continue

        print >>sys.stderr, 'Render results to buffer...'
        with gzip.GzipFile(
                os.path.basename(img_uri), mode='wb', fileobj=buf) as compressed:
            results = Results()
            output(history.split('\n')[-60*24*DAYS:], results)  # Last 21 days
            render(results, compressed)

        print >>sys.stderr, 'Copy buffer to %s...' % img_uri
        proc = subprocess.Popen(
            ['gsutil', '-q',
             '-h', 'Content-Type:image/svg+xml',
             '-h', 'Cache-Control:public, max-age=%d' % (
                 60 if service_account else 5),
             '-h', 'Content-Encoding:gzip',  # GCS decompresses if necessary
             'cp', '-a', 'public-read', '-', img_uri],
            stdin=subprocess.PIPE)
        proc.communicate(buf.getvalue())
        code = proc.wait()
        if code:
            print >>sys.stderr, 'Failed to copy rendering to %s: %d' % (
                img_uri, code)
        time.sleep(60)


if __name__ == '__main__':
    # Log all arguments.
    pprint.PrettyPrinter(stream=sys.stderr).pprint(sys.argv)

    render_forever(*sys.argv[1:])  # pylint: disable=no-value-for-parameter
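# Typical invocation (URIs are illustrative):
#   graph.py gs://<bucket>/sq/history.txt gs://<bucket>/queue-health.svg [key.json]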