github.com/shashidharatd/test-infra@v0.0.0-20171006011030-71304e1ca560/triage/summarize.py

#!/usr/bin/env python2

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''
Groups failed tests together by finding edit distances between their failure strings,
and emits JSON for rendering in a browser.
'''

# pylint: disable=invalid-name,missing-docstring


import argparse
import functools
import hashlib
import json
import os
import re
import sys
import time
import zlib

import berghelroach

editdist = berghelroach.dist

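# Dates and timestamps in several common log formats; these are blanked out entirely,
# since they differ from run to run but carry no clustering signal.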
flakeReasonDateRE = re.compile(
    r'[A-Z][a-z]{2}, \d+ \w+ 2\d{3} [\d.-: ]*([-+]\d+)?|'
    r'\w{3}\s+\d{1,2} \d+:\d+:\d+(\.\d+)?|(\d{4}-\d\d-\d\d.|.\d{4} )\d\d:\d\d:\d\d(.\d+)?')
# Find random noisy strings that should be replaced with renumbered strings, for more similarity.
flakeReasonOrdinalRE = re.compile(
    r'0x[0-9a-fA-F]+' # hex constants
    r'|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(:\d+)?' # IPs + optional port
    r'|[0-9a-fA-F]{8}-\S{4}-\S{4}-\S{4}-\S{12}(-\d+)?' # UUIDs + trailing digits
    r'|[0-9a-f]{12,32}' # hex garbage
    r'|(?<=minion-group-|default-pool-)[-0-9a-z]{4,}'  # node names
)


def normalize(s):
    """
    Given a traceback or error message from a test, reduce excess entropy to make
    clustering easier.

    This includes:
    - blanking dates and timestamps
    - renumbering unique information like
        - pointer addresses
        - UUIDs
        - IP addresses
    - sorting randomly ordered map[] strings.
    """

    # blank out dates
    s = flakeReasonDateRE.sub('TIME', s)

    # do alpha conversion-- rename random garbage strings (hex pointer values, node names, etc)
    # into 'UNIQ1', 'UNIQ2', etc.
    matches = {}
    def repl(m):
        s = m.group(0)
        if s not in matches:
            matches[s] = 'UNIQ%d' % (len(matches) + 1)
        return matches[s]

    if 'map[' in s:
        # Go's maps are in a random order. Try to sort them to reduce diffs.
        s = re.sub(r'map\[([^][]*)\]',
                   lambda m: 'map[%s]' % ' '.join(sorted(m.group(1).split())),
                   s)

    s = flakeReasonOrdinalRE.sub(repl, s)

    if len(s) > 10000:
        # for long strings, remove repeated lines!
        s = re.sub(r'(?m)^(.*\n)\1+', r'\1', s)

    if len(s) > 200000:  # ridiculously long test output
        s = s[:100000] + '\n...[truncated]...\n' + s[-100000:]

    return s

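# A minimal illustration (never called by the pipeline): assuming the regexes above
# behave as intended, two failures that differ only in pointers, IPs and timestamps
# normalize to the same string and therefore end up in the same cluster.
def _normalize_example():
    raw = 'error at 0xc42000e2a0 on 10.240.0.5:8080 at Mar  4 12:00:01'
    return normalize(raw)  # expected roughly: 'error at UNIQ1 on UNIQ2 at TIME'
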
def normalize_name(name):
    """
    Given a test name, remove [...]/{...}.

    Matches code in testgrid and kubernetes/hack/update_owners.py.
    """
    name = re.sub(r'\[.*?\]|\{.*?\}', '', name)
    name = re.sub(r'\s+', ' ', name)
    return name.strip()

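# For example (test name is illustrative): '[sig-storage] CSI Volumes [Serial] should work'
# normalizes to 'CSI Volumes should work', so tagged and untagged variants of a test
# name compare equal.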

def make_ngram_counts(s, ngram_counts={}):
    """
    Convert a string into a histogram of frequencies for different byte combinations.
    This can be used as a heuristic to estimate edit distance between two strings in
    constant time.

    Instead of counting each ngram individually, they are hashed into buckets.
    This makes the output count size constant.
    """

    # Yes, I'm intentionally memoizing here.
    # pylint: disable=dangerous-default-value

    size = 64
    if s not in ngram_counts:
        counts = [0] * size
        for x in xrange(len(s)-3):
            counts[zlib.crc32(s[x:x+4].encode('utf8')) & (size - 1)] += 1
        ngram_counts[s] = counts  # memoize
    return ngram_counts[s]


def ngram_editdist(a, b):
    """
    Compute a heuristic lower-bound edit distance using ngram counts.

    An insert/deletion/substitution can cause up to 4 ngrams to differ:

    abcdefg => abcefg
    (abcd, bcde, cdef, defg) => (abce, bcef, cefg)

    This will underestimate the edit distance in many cases:
    - ngrams hashing into the same bucket will get confused
    - a large-scale transposition will barely disturb ngram frequencies,
      but will have a very large effect on edit distance.

    It is useful to avoid more expensive precise computations when they are
    guaranteed to exceed some limit (being a lower bound), or as a proxy when
    the exact edit distance computation is too expensive (for long inputs).
    """
    counts_a = make_ngram_counts(a)
    counts_b = make_ngram_counts(b)
    return sum(abs(x-y) for x, y in zip(counts_a, counts_b))/4

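# A small sketch of the lower-bound property (never called anywhere): identical strings
# always score 0, and the single deletion below disturbs at most 4 ngrams, so the
# heuristic returns at most 1 while the true edit distance is exactly 1.
def _ngram_editdist_example():
    assert ngram_editdist('kubelet panic', 'kubelet panic') == 0
    return ngram_editdist('abcdefg', 'abcefg')  # 0 or 1, never more
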

def make_ngram_counts_digest(s):
    """
    Returns a hashed version of the ngram counts.
    """
    return hashlib.sha1(str(make_ngram_counts(s))).hexdigest()[:20]


def file_memoize(description, name):
    """
    Decorator to save a function's results to a file.
    """
    def inner(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if os.path.exists(name):
                data = json.load(open(name))
                print 'done (cached)', description
                return data
            data = func(*args, **kwargs)
            json.dump(data, open(name, 'w'))
            print 'done', description
            return data
        wrapper.__wrapped__ = func
        return wrapper
    return inner

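# Note: the cache key is just the file name, so stale results persist across runs;
# delete failed.json / failed_clusters_*.json (or start in a fresh working directory)
# to force recomputation.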

@file_memoize('loading failed tests', 'failed.json')
def load_failures(builds_file, tests_file):
    builds = {}
    for build in json.load(open(builds_file)):
        if not build['started'] or not build['number']:
            continue
        for attr in ('started', 'tests_failed', 'number', 'tests_run'):
            build[attr] = int(build[attr])
        build['elapsed'] = int(float(build['elapsed']))
        if 'pr-logs' in build['path']:
            build['pr'] = build['path'].split('/')[-3]
        builds[build['path']] = build

    failed_tests = {}
    for test in json.load(open(tests_file)):
        failed_tests.setdefault(test['name'], []).append(test)
    for tests in failed_tests.itervalues():
        tests.sort(key=lambda t: t['build'])

    return builds, failed_tests


def find_match(fnorm, clusters):
    for ngram_dist, other in sorted((ngram_editdist(fnorm, x), x) for x in clusters):
        # allow up to 10% differences
        limit = int((len(fnorm)+len(other))/2.0 * 0.10)

        if ngram_dist > limit:
            continue

        if limit <= 1 and other != fnorm:  # no chance
            continue

        dist = editdist(fnorm, other, limit)

        if dist < limit:
            return other

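# Candidates are tried in order of their cheap ngram lower bound; the expensive
# Berghel-Roach edit distance from the berghelroach module only runs when that bound
# is within the limit. For example, two ~1000-character failure texts get
# limit = int((1000+1000)/2.0 * 0.10) = 100, so any candidate whose ngram distance
# exceeds 100 is skipped outright.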

def cluster_test(tests):
    """
    Compute failure clusters given a list of failures for one test.

    Args:
        tests: list of failed test dictionaries, with 'failure_text' keys
    Returns:
        {failure_text: [failure_in_cluster_1, failure_in_cluster_2, ...]}
    """
    clusters = {}
    start = time.time()

    for test in tests:
        ftext = test['failure_text']
        fnorm = normalize(ftext)
        if fnorm in clusters:
            clusters[fnorm].append(test)
        else:
            other = find_match(fnorm, clusters)
            if other:
                clusters[other].append(test)
            else:
                clusters[fnorm] = [test]
        if time.time() > start + 60:
            print 'bailing early, taking too long!'
            break
    return clusters

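# Clustering is greedy: each failure joins the closest matching cluster key (by the
# ngram heuristic) that passes the edit-distance check, otherwise its own normalized
# text becomes a new cluster key. The 60-second bail-out keeps a single pathological
# test from dominating the run.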

@file_memoize('clustering inside each test', 'failed_clusters_local.json')
def cluster_local(failed_tests):
    """Cluster together the failures for each test."""
    clustered = {}
    for test_name, tests in sorted(failed_tests.iteritems(), key=lambda x: len(x[1]), reverse=True):
        print len(tests), test_name,
        sys.stdout.flush()
        clustered[test_name] = cluster_test(tests)
        print len(clustered[test_name])
    return clustered


@file_memoize('clustering across tests', 'failed_clusters_global.json')
def cluster_global(clustered, previous_clustered):
    """Combine together clustered failures for each test.

    This is done hierarchically for efficiency-- each test's failures are likely to be similar,
    reducing the number of clusters that need to be paired up at this stage.

    Args:
        clustered: {test_name: {failure_text: [failure_1, failure_2, ...], ...}, ...}
        previous_clustered: cluster list from a previous run, used to seed cluster keys (or None)
    Returns:
        {failure_text: {test_name: [failure_1, failure_2, ...], ...}, ...}
    """
    clusters = {}

    if previous_clustered:
        # seed clusters using output from the previous run
        n = 0
        for cluster in previous_clustered:
            key = cluster['key']
            if key != normalize(key):
                print key
                print normalize(key)
                n += 1
                continue
            clusters[cluster['key']] = {}
        print 'Seeding with %d previous clusters' % len(clusters)
        if n:
            print '!!! %d clusters lost from different normalization! !!!' % n


    for n, (test_name, cluster) in enumerate(
            sorted(clustered.iteritems(),
                   key=lambda (k, v): sum(len(x) for x in v.itervalues()),
                   reverse=True),
            1):
        print '%d/%d %d %s' % (n, len(clustered), len(cluster), test_name)
        for key, tests in sorted(cluster.iteritems(), key=lambda x: len(x[1]), reverse=True):
            if key in clusters:
                clusters[key].setdefault(test_name, []).extend(tests)
            else:
                other = find_match(key, clusters)
                if other:
                    clusters[other].setdefault(test_name, []).extend(tests)
                else:
                    clusters[key] = {test_name: list(tests)}

    # If we seeded clusters using the previous run's keys, some of those
    # clusters may have disappeared. Remove the resulting empty entries.
    for k in {k for k, v in clusters.iteritems() if not v}:
        clusters.pop(k)

    return clusters


def tests_group_by_job(tests, builds):
    """Turn a list of test failures into a sorted list of (job, [buildnumber, ...]) pairs."""
    groups = {}
    for test in tests:
        try:
            build = builds[test['build']]
        except KeyError:
            continue
        if 'number' in build:
            groups.setdefault(build['job'], set()).add(build['number'])
    return sorted(((key, sorted(value, reverse=True)) for key, value in groups.iteritems()),
                  key=lambda (k, v): (-len(v), k))

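# tests_group_by_job returns, e.g. (job names are illustrative):
#   [('ci-kubernetes-e2e-gce', [1452, 1449]), ('ci-kubernetes-node-kubelet', [880])]
# sorted so the jobs with the most affected builds come first.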

SPAN_RE = re.compile(r'\w+|\W+')

def common_spans(xs):
    """
    Finds something similar to the longest common subsequence of xs, but much faster.

    Returns a list [matchlen_1, mismatchlen_1, matchlen_2, mismatchlen_2, ...] of
    alternating span lengths in xs[0], where 'match' spans are made of tokens that
    are present in every member of xs.
    """
    common = None
    for x in xs:
        x_split = SPAN_RE.findall(x)
        if common is None:  # first iteration
            common = set(x_split)
        else:
            common.intersection_update(x_split)

    spans = []
    match = True
    span_len = 0
    for x in SPAN_RE.findall(xs[0]):
        if x in common:
            if not match:
                match = True
                spans.append(span_len)
                span_len = 0
            span_len += len(x)
        else:
            if match:
                match = False
                spans.append(span_len)
                span_len = 0
            span_len += len(x)

    if span_len:
        spans.append(span_len)

    return spans

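# A quick illustration (not used by the pipeline): 'expected ' (9 chars) and ', got '
# (6 chars) are common to both inputs while the single-digit values differ, so the
# result is [9, 1, 6, 1].
def _common_spans_example():
    return common_spans(['expected 1, got 2', 'expected 3, got 4'])
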

def clusters_to_display(clustered, builds):
    """Transpose and sort the output of cluster_global."""

    return [{
        "key": key,
        "id": key_id,
        "spans": common_spans([f['failure_text'] for _, fs in clusters for f in fs]),
        "text": clusters[0][1][0]['failure_text'],
        "tests": [{
            "name": test_name,
            "jobs": [{"name": n, "builds": b}
                     for n, b in tests_group_by_job(tests, builds)]
            }
                  for test_name, tests in sorted(clusters, key=lambda (n, t): (-len(t), n))
                 ]
        }
            for key, key_id, clusters in clustered if sum(len(x[1]) for x in clusters) > 1
           ]


def builds_to_columns(builds):
    """Convert a dict of build dictionaries into a columnar form.

    This compresses much better with gzip."""

    jobs = {}

    cols = {v: [] for v in 'started tests_failed elapsed tests_run result executor pr'.split()}
    out = {'jobs': jobs, 'cols': cols, 'job_paths': {}}
    for build in sorted(builds.itervalues(), key=lambda b: (b['job'], b['number'])):
        if 'number' not in build:
            continue
        index = len(cols['started'])
        for key, entries in cols.iteritems():
            entries.append(build.get(key))
        job = jobs.setdefault(build['job'], {})
        if not job:
            out['job_paths'][build['job']] = build['path'][:build['path'].rindex('/')]
        job[build['number']] = index

    for k, indexes in jobs.items():
        numbers = sorted(indexes)
        base = indexes[numbers[0]]
        count = len(numbers)

        # optimization: if we have a dense sequential mapping of builds=>indexes,
        # store only the first build number, the run length, and the first index number.
        if numbers[-1] == numbers[0] + count - 1 and \
                all(indexes[k] == n + base for n, k in enumerate(numbers)):
            jobs[k] = [numbers[0], count, base]
            for n in numbers:
                assert n <= numbers[0] + len(numbers), (k, n, jobs[k], len(numbers), numbers)

    return out

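# For instance, a job whose builds 1000-1003 landed at column indexes 7-10 is stored
# as [1000, 4, 7] instead of {1000: 7, 1001: 8, 1002: 9, 1003: 10}; the consumer can
# expand the run back into explicit build->index pairs.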

def render(builds, clustered):
    clustered_sorted = sorted(
        clustered.iteritems(),
        key=lambda (k, v): (-sum(len(ts) for ts in v.itervalues()), k))
    clustered_tuples = [(k,
                         make_ngram_counts_digest(k),
                         sorted(clusters.items(), key=lambda (n, t): (-len(t), n)))
                        for k, clusters in clustered_sorted]

    return {'clustered': clusters_to_display(clustered_tuples, builds),
            'builds': builds_to_columns(builds)}


SIG_LABEL_RE = re.compile(r'\[sig-([^]]*)\]')

def annotate_owners(data, builds, owners):
    """
    Assign ownership to a cluster based on the share of hits in the last day.
    """
    owner_re = re.compile(r'(?:%s)' % '|'.join(
        '(?P<%s>%s)' % (
            sig.replace('-', '_'),  # regex group names can't have -
            '|'.join(re.escape(p) for p in prefixes)
        )
        for sig, prefixes in owners.iteritems()
    ))
    job_paths = data['builds']['job_paths']
    yesterday = max(data['builds']['cols']['started']) - (60 * 60 * 24)

    for cluster in data['clustered']:
        owner_counts = {}
        for test in cluster['tests']:
            m = SIG_LABEL_RE.search(test['name'])
            if m:
                owner = m.group(1)
            else:
                m = owner_re.match(normalize_name(test['name']))
                if not m or not m.groupdict():
                    continue
                owner = next(k for k, v in m.groupdict().iteritems() if v)
            owner = owner.replace('_', '-')
            counts = owner_counts.setdefault(owner, [0, 0])
            for job in test['jobs']:
                if ':' in job['name']:  # non-standard CI
                    continue
                job_path = job_paths[job['name']]
                for build in job['builds']:
                    if builds['%s/%d' % (job_path, build)]['started'] > yesterday:
                        counts[0] += 1
                    else:
                        counts[1] += 1
        if owner_counts:
            owner = max(owner_counts.items(), key=lambda (o, c): (c, o))[0]
            cluster['owner'] = owner
        else:
            cluster['owner'] = 'testing'

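# Owner resolution, in order: an explicit '[sig-foo]' label in the test name wins;
# otherwise the test name is matched against the per-SIG prefixes from the --owners
# file; clusters with no recognizable owner fall back to 'testing'. When several SIGs
# match within one cluster, hits from the last day outweigh older ones.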

def render_slice(data, builds, prefix='', owner=''):
    clustered = []
    builds_out = {}
    jobs = set()
    for cluster in data['clustered']:
        # print [cluster['id'], prefix]
        if owner and cluster.get('owner') == owner:
            clustered.append(cluster)
        elif prefix and cluster['id'].startswith(prefix):
            clustered.append(cluster)
        else:
            continue
        for test in cluster['tests']:
            for job in test['jobs']:
                jobs.add(job['name'])
    for path, build in builds.iteritems():
        if build['job'] in jobs:
            builds_out[path] = build
    return {'clustered': clustered, 'builds': builds_to_columns(builds_out)}


def parse_args(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('builds', help='builds.json file from BigQuery')
    parser.add_argument('tests', help='tests.json file from BigQuery')
    parser.add_argument('--previous', help='previous output', type=argparse.FileType('r'))
    parser.add_argument('--owners', help='test owner SIGs', type=argparse.FileType('r'))
    parser.add_argument('--output', default='failure_data.json')
    parser.add_argument('--output_slices',
                        help='Output slices to this path (must include PREFIX in template)')
    return parser.parse_args(args)

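# Example invocation (file names are illustrative; builds.json and tests.json come from
# the BigQuery export step):
#   python summarize.py builds.json tests.json --owners owners.json \
#       --output failure_data.json --output_slices slices/failure_data_PREFIX.json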

def main(args):
    builds, failed_tests = load_failures(args.builds, args.tests)

    previous_clustered = None
    if args.previous:
        print 'loading previous'
        previous_clustered = json.load(args.previous)['clustered']

    clustered_local = cluster_local(failed_tests)
    clustered = cluster_global(clustered_local, previous_clustered)

    print '%d clusters' % len(clustered)

    data = render(builds, clustered)

    if args.owners:
        owners = json.load(args.owners)
        annotate_owners(data, builds, owners)

    json.dump(data, open(args.output, 'w'),
              sort_keys=True)

    if args.output_slices:
        assert 'PREFIX' in args.output_slices
        for subset in range(256):
            id_prefix = '%02x' % subset
            json.dump(render_slice(data, builds, id_prefix),
                      open(args.output_slices.replace('PREFIX', id_prefix), 'w'),
                      sort_keys=True)
        if args.owners:
            owners.setdefault('testing', [])  # for output
            for owner in owners:
                json.dump(render_slice(data, builds, prefix='', owner=owner),
                          open(args.output_slices.replace('PREFIX', 'sig-' + owner), 'w'),
                          sort_keys=True)


if __name__ == '__main__':
    main(parse_args(sys.argv[1:]))