github.com/shashidharatd/test-infra@v0.0.0-20171006011030-71304e1ca560/queue_health/graph/graph.py (about) 1 #!/usr/bin/env python 2 3 # Copyright 2016 The Kubernetes Authors. 4 # 5 # Licensed under the Apache License, Version 2.0 (the "License"); 6 # you may not use this file except in compliance with the License. 7 # You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 17 """Read historical samples of merge queue and plot them.""" 18 19 from __future__ import division 20 21 import collections 22 import cStringIO 23 import datetime 24 import gzip 25 import os 26 import pprint 27 import subprocess 28 import sys 29 import time 30 import traceback 31 32 # pylint: disable=import-error,wrong-import-position 33 import matplotlib 34 matplotlib.use('Agg') # For savefig 35 36 import matplotlib.dates as mdates 37 import matplotlib.patches as mpatches 38 import matplotlib.pyplot as plt 39 40 import numpy 41 # pylint: enable=wrong-import-position,import-error 42 43 DAYS = 21 # Graph this many days of history. 
def mean(*a):
    """Calculate the mean for items."""
    return numpy.mean(*a)  # pylint: disable=no-value-for-parameter


def parse_line(
        date, timenow, online, pulls, queue,
        run, blocked, merge_count=0):  # merge_count may be missing
    """Parse a whitespace-separated sq/history.txt line into typed fields.

    Args:
        date: 'YYYY-MM-DD' date string.
        timenow: 'HH:MM:SS.ffffff' time string (UTC).
        online: 'True' when the merge queue is up.
        pulls: number of open PRs, as a string.
        queue: number of PRs in the queue, as a string.
        run: unused legacy counter from the history file.
        blocked: 'True' when the queue cannot merge.
        merge_count: cumulative merge count (absent in very old lines).

    Returns:
        Tuple of (datetime, online, pulls, queue, run, blocked, merge_count).
    """
    return (
        datetime.datetime.strptime(
            '{} {}'.format(date, timenow),
            '%Y-%m-%d %H:%M:%S.%f'),
        online == 'True',  # Merge queue is down/initializing
        int(pulls),  # Number of open PRs
        int(queue),  # PRs in the queue
        int(run),  # Totally worthless
        blocked == 'True',  # Cannot merge
        int(merge_count),  # Number of merges
    )


def fresh_color(tick):
    """Return pyplot color for freshness of data: black if <1h old, else red."""
    # Ticks are UTC, so compare against utcnow().
    if datetime.datetime.utcnow() - tick < datetime.timedelta(hours=1):
        return 'k'  # black
    return 'r'


def merge_color(rate):
    """Return pyplot color for merge rate: red < 15 <= yellow < 30 <= green."""
    if rate < 15:
        return 'r'
    if rate < 30:
        return 'y'
    return 'g'


def backlog_color(backlog):
    """Return pyplot color for queue backlog: green < 5, red > 24, else yellow."""
    if backlog < 5:
        return 'g'
    if backlog > 24:
        return 'r'
    return 'y'


def happy_color(health):
    """Return pyplot color for health fraction: green > 0.8, yellow > 0.6, else red."""
    if health > 0.8:
        return 'g'
    if health > 0.6:
        return 'y'
    return 'r'


def depth_color(depth):
    """Return pyplot color for the queue depth: green < 20 <= yellow < 40 <= red."""
    if depth < 20:
        return 'g'
    if depth < 40:
        return 'y'
    return 'r'


def format_timedelta(delta):
    """Return XdYhZm string representing timedelta."""
    # Use floor division: true division is imported at the top of this
    # module, so '/' would yield floats here.
    return '%dd%dh%dm' % (
        delta.days, delta.seconds // 3600, (delta.seconds % 3600) // 60)


class Sampler(object):  # pylint: disable=too-few-public-methods
    """Track mean and total for a sliding window of numeric samples."""

    def __init__(self, maxlen=60*24):
        # maxlen defaults to one day of minute-resolution samples.
        self.maxlen = maxlen
        self.samples = collections.deque()
        # Keep running aggregates as instance state (they were class
        # attributes, which reads like shared mutable state).
        self.total = 0
        self.mean = 0

    def __iadd__(self, sample):
        self.append(sample)
        return self

    def append(self, sample):
        """Append a sample, updating total and mean, dropping old samples."""
        self.samples.append(sample)
        self.total += sample
        while len(self.samples) > self.maxlen:
            self.total -= self.samples.popleft()
        self.mean = float(self.total) / len(self.samples)


class Results(object):  # pylint: disable=too-few-public-methods,too-many-instance-attributes
    """Time series processed from sq/history.txt, ready for plotting."""

    def __init__(self):
        self.dts = []  # Sample timestamps (UTC)
        self.prs = []  # Open PRs at each tick
        self.queued = []  # Approved PRs waiting for merge at each tick
        self.queue_avg = []  # 14-day rolling average of queued
        self.happiness = {  # Percentage of last N days queue was unblocked
            1: [],
            14: [],
        }
        self.active_merge_rate = {  # Merge rate when queue is active
            1: [],
            14: [],
        }
        self.merge_rate = {  # Merge rate including when queue is empty
            1: [],
            14: [],
        }
        self.merges = []  # Merges observed at each tick
        self.backlog = {  # Queue time in hours during the past N days
            1: [],
            14: [],
        }

        self.blocked_intervals = []  # (start, end) spans when merging blocked
        self.offline_intervals = []  # (start, end) spans when queue offline

    def append(self, tick, did_merge, pulls, queue,
               real_merges, active_merges, happy_moments):
        """Append a sample of results.

        Args:
            tick: datetime of this sample
            did_merge: number of prs merged
            pulls: number of open prs
            queue: number of approved prs waiting for merge
            real_merges: merge rate over various time periods
            active_merges: merge rate when queue is active (full or merging)
            happy_moments: window of when the queue has been unblocked.
        """
        # pylint: disable=too-many-locals
        # Make them steps instead of slopes.
        if self.dts:
            self.dts.append(tick)

            # Append the previous value at the current time
            # which makes all changes move at right angles.
            for happy in self.happiness.values():
                happy.append(happy[-1])
            self.merges.append(did_merge)  # ???
            self.prs.append(self.prs[-1])
            self.queued.append(self.queued[-1])
            self.queue_avg.append(self.queue_avg[-1])
            for val in self.merge_rate.values():
                val.append(val[-1])
            for val in self.active_merge_rate.values():
                val.append(val[-1])
        # Now append the actual values for this tick.
        self.dts.append(tick)
        for days, happy in self.happiness.items():
            happy.append(happy_moments[days].mean)
        self.merges.append(did_merge)
        self.prs.append(pulls)
        self.queued.append(queue)
        weeks2 = 14*24*60  # 14 days of minute samples
        avg = mean(self.queued[-weeks2:])
        self.queue_avg.append(avg)
        for days in self.merge_rate:
            self.merge_rate[days].append(real_merges[days].total / float(days))
        for days in self.active_merge_rate:
            self.active_merge_rate[days].append(
                active_merges[days].total / float(days))
        for dur, items in self.backlog.items():
            dur = 60*24*dur  # Convert days to minutes of samples.
            if items:
                items.append(items[-1])  # Step, as above.
            # Scale merges/minute-window to hours of wait per queued PR.
            dur_merges = sum(self.merges[-dur:]) * 24.0
            if dur_merges:
                items.append(sum(self.queued[-dur:]) / dur_merges)
            elif items:
                items.append(items[-1])  # No merges: hold the previous value.
            else:
                items.append(0)


def output(history_lines, results):  # pylint: disable=too-many-locals,too-many-branches
    """Read historical data, fill results with processed info and return it.

    Args:
        history_lines: iterable of raw sq/history.txt lines.
        results: a Results instance to populate.

    Returns:
        The populated results object.
    """
    real_merges = {
        1: Sampler(),
        14: Sampler(14*60*24),
    }
    active_merges = {
        1: Sampler(),
        14: Sampler(14*60*24),
    }
    happy_moments = {d: Sampler(d*60*24) for d in results.happiness}

    tick = None
    last_merge = 0  # Number of merges last sample, resets on queue restart
    start_blocked = None
    start_offline = None

    for line in history_lines:
        try:
            tick, online, pulls, queue, dummy, blocked, merged = parse_line(
                *line.strip().split(' '))
        except TypeError:  # line does not fit expected criteria
            continue
        # History ticks are UTC (see fresh_color and the utcnow fallback
        # below), so filter against utcnow() rather than local now().
        if tick < datetime.datetime.utcnow() - datetime.timedelta(days=DAYS+14):
            continue
        if not pulls and not queue and not merged:  # Bad sample
            continue

        if merged >= last_merge:
            did_merge = merged - last_merge
        elif online:  # Restarts reset the number to 0
            did_merge = merged
        else:
            did_merge = 0

        last_merge = merged
        for moments in happy_moments.values():
            moments.append(int(bool(online and not blocked)))

        for val in real_merges.values():
            val += did_merge
        if queue or did_merge:  # Only sample when the queue had work to do.
            for val in active_merges.values():
                val += did_merge

        if not start_offline and not online:
            start_offline = tick
        if start_offline and online:
            results.offline_intervals.append((start_offline, tick))
            start_offline = None

        if not online:  # Skip offline entries
            continue

        results.append(
            tick, did_merge, pulls, queue, real_merges, active_merges,
            happy_moments)

        if not start_blocked and blocked:
            start_blocked = tick
        if start_blocked and not blocked:
            results.blocked_intervals.append((start_blocked, tick))
            start_blocked = None
    if tick and not online:
        # Extend a trailing offline sample up to the present.
        tick = datetime.datetime.utcnow()
        results.append(
            tick, 0, pulls, queue, real_merges, active_merges, happy_moments)
    if start_blocked:
        results.blocked_intervals.append((start_blocked, tick))
    if start_offline:
        results.offline_intervals.append((start_offline, tick))
    return results


def render_backlog(results, ax_backlog):
    """Render how long items spend in the queue."""
    dts = results.dts
    backlog = results.backlog
    ax_backlog.yaxis.tick_right()
    cur = backlog[1][-1]
    color = backlog_color(cur)
    p_day, = ax_backlog.plot(dts, backlog[1], '%s-' % color)
    p_week, = ax_backlog.plot(dts, backlog[14], 'k:')
    # Clamp the axis when outliers would otherwise flatten the graph.
    if max(backlog[1]) > 100 or max(backlog[14]) > 100:
        ax_backlog.set_ylim([0, 100])
    ax_backlog.set_ylabel('Backlog')
    ax_backlog.legend(
        [p_day, p_week],
        ['1d avg: %.1f hr wait' % cur,
         '14d avg: %.1f hr wait' % backlog[14][-1]],
        'lower left',
        fontsize='x-small',
    )


def render_merges(results, ax_merged):
    """Render information about the merge rate."""
    dts = results.dts
    ax_merged.yaxis.tick_right()
    merge_rate = results.merge_rate
    # Color by the current active rate (rate while the queue has work).
    color = merge_color(results.active_merge_rate[1][-1])
    p_day, = ax_merged.plot(dts, merge_rate[1], '%s-' % color)
    p_active, = ax_merged.plot(dts, results.active_merge_rate[1], '%s:' % color)
    p_week, = ax_merged.plot(dts, merge_rate[14], 'k:')
    ax_merged.set_ylabel('Merge rate')
    ax_merged.legend(
        [p_active, p_day, p_week],
        ['active rate: %.1f PRs/day' % results.active_merge_rate[1][-1],
         'real rate: %.1f PRs/day' % merge_rate[1][-1],
         'real 14d avg: %.1f PRs/day' % merge_rate[14][-1]],
        'lower left',
        fontsize='x-small',
    )


def render_health(results, ax_health):
    """Render the percentage of time the queue is healthy/online."""
    # pylint: disable=too-many-locals
    dts = results.dts
    happiness = results.happiness
    ax_health.yaxis.tick_right()

    health_color = '%s-' % happy_color(happiness[1][-1])
    p_1dhealth, = ax_health.plot(dts, happiness[1], health_color)
    p_14dhealth, = ax_health.plot(dts, happiness[14], 'k:')
    cur = 100 * happiness[1][-1]
    cur14 = 100 * happiness[14][-1]
    ax_health.set_ylabel('Unblocked %')

    ax_health.set_ylim([0.0, 1.0])
    # All axes share x, so this sets the window for every subplot.
    # Ticks are UTC, so anchor the window at utcnow(), not local now().
    ax_health.set_xlim(
        left=datetime.datetime.utcnow() - datetime.timedelta(days=DAYS))

    for start, end in results.blocked_intervals:
        ax_health.axvspan(start, end, alpha=0.2, color='brown', linewidth=0)
    for start, end in results.offline_intervals:
        ax_health.axvspan(start, end, alpha=0.2, color='black', linewidth=0)

    patches = [
        p_1dhealth,
        p_14dhealth,
        mpatches.Patch(color='brown', label='blocked', alpha=0.2),
        mpatches.Patch(color='black', label='offline', alpha=0.2),
    ]

    ax_health.legend(
        patches,
        ['1d avg: %.1f%%' % cur, '14d avg: %.1f%%' % cur14,
         'blocked', 'offline'],
        'lower left',
        fontsize='x-small',
    )
def render_queue(results, ax_open):
    """Render the queue graph (open prs, queued, prs)."""
    dts = results.dts
    prs = results.prs
    queued = results.queued
    queue_avg = results.queue_avg
    # Queue depth gets its own right-hand y-axis, sharing x with open PRs.
    ax_queued = ax_open.twinx()
    p_open, = ax_open.plot(dts, prs, 'b-')
    # Color the queue-depth line by the current depth (green/yellow/red).
    color_depth = depth_color(queued[-1])
    p_queue, = ax_queued.plot(dts, queued, color_depth)
    p_14dqueue, = ax_queued.plot(dts, queue_avg, 'k:')
    ax_queued.legend(
        [p_open, p_queue, p_14dqueue],
        [
            'open: %d PRs' % prs[-1],
            'approved: %d PRs' % queued[-1],
            '14d avg: %.1f PRs' % queue_avg[-1],
        ],
        'lower left',
        fontsize='x-small',
    )


def render(results, out_file):
    """Render three graphs to outfile from results."""
    # NOTE(review): four stacked axes are created although the docstring
    # says three; backlog/merges/open+queued/health from top to bottom.
    fig, (ax_backlog, ax_merges, ax_open, ax_health) = plt.subplots(
        4, sharex=True, figsize=(16, 10), dpi=100)

    fig.autofmt_xdate()
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
    plt.gca().xaxis.set_major_locator(mdates.DayLocator())
    # Only draw when there is data; the renderers index results.dts[-1] etc.
    if results.dts:
        render_queue(results, ax_open)
        render_merges(results, ax_merges)
        render_backlog(results, ax_backlog)
        render_health(results, ax_health)
        # Footer stamps render time and last-sample time; fresh_color turns
        # the text red when the newest sample is over an hour old.
        fig.text(
            0.1, 0.00,
            'image: %s, sample: %s' % (
                datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M'),
                results.dts[-1].strftime('%Y-%m-%d %H:%M'),
            ),
            horizontalalignment='left',
            fontsize='x-small',
            color=fresh_color(results.dts[-1]),
        )

    plt.savefig(out_file, bbox_inches='tight', format='svg')
    plt.close()


def render_forever(history_uri, img_uri, service_account=None):
    """Download results from history_uri, render to svg and save to img_uri.

    Loops forever: each minute it cats the history file from GCS, renders
    a gzipped svg into an in-memory buffer, and uploads it with gsutil.

    Args:
        history_uri: gs:// path of sq/history.txt to read.
        img_uri: gs:// path to write the rendered svg to.
        service_account: optional key file used to authenticate gcloud/gsutil.
    """
    if service_account:
        print >>sys.stderr, 'Activating service account using: %s' % (
            service_account)
        # Authenticate so the gsutil calls below can read/write the bucket.
        subprocess.check_call([
            'gcloud',
            'auth',
            'activate-service-account',
            '--key-file=%s' % service_account,
        ])
    # One reusable in-memory buffer for the gzipped svg across iterations.
    buf = cStringIO.StringIO()
    while True:
        print >>sys.stderr, 'Truncate render buffer'
        buf.seek(0)
        buf.truncate()
        print >>sys.stderr, 'Cat latest results from %s...' % history_uri
        try:
            history = subprocess.check_output(
                ['gsutil', '-q', 'cat', history_uri])
        except subprocess.CalledProcessError:
            # Transient fetch failure: log, back off briefly, retry.
            traceback.print_exc()
            time.sleep(10)
            continue

        print >>sys.stderr, 'Render results to buffer...'
        # Gzip-wrap the buffer so the uploaded svg is compressed; the
        # Content-Encoding header below tells GCS how to serve it.
        with gzip.GzipFile(
            os.path.basename(img_uri), mode='wb', fileobj=buf) as compressed:
            results = Results()
            output(history.split('\n')[-60*24*DAYS:], results)  # Last 21 days
            render(results, compressed)

        print >>sys.stderr, 'Copy buffer to %s...' % img_uri
        # Stream the buffer to gsutil's stdin ('-' source) as a public svg.
        proc = subprocess.Popen(
            ['gsutil', '-q',
             '-h', 'Content-Type:image/svg+xml',
             '-h', 'Cache-Control:public, max-age=%d' % (
                 60 if service_account else 5),
             '-h', 'Content-Encoding:gzip',  # GCS decompresses if necessary
             'cp', '-a', 'public-read', '-', img_uri],
            stdin=subprocess.PIPE)
        proc.communicate(buf.getvalue())
        code = proc.wait()
        if code:
            # Upload failure is logged but not fatal; retry next cycle.
            print >>sys.stderr, 'Failed to copy rendering to %s: %d' % (
                img_uri, code)
        time.sleep(60)


if __name__ == '__main__':
    # log all arguments.
    pprint.PrettyPrinter(stream=sys.stderr).pprint(sys.argv)

    render_forever(*sys.argv[1:])  # pylint: disable=no-value-for-parameter