github.com/aclements/go-misc@v0.0.0-20240129233631-2f6ede80790c/internal/loganal/classify.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package loganal
     6  
     7  import (
     8  	"regexp"
     9  	"strings"
    10  )
    11  
    12  var (
    13  	canonMsg = regexp.MustCompile(`[0-9]+`)
    14  
    15  	// numberWords matches words that consist of both letters and
    16  	// digits. Since this is meant to canonicalize numeric fields
    17  	// of error messages, we accept any Unicode letter, but only
    18  	// digits 0-9. We match the whole word to catch things like
    19  	// hexadecimal and temporary file names.
    20  	numberWords = regexp.MustCompile(`\pL*[0-9][\pL0-9]*`)
    21  )
    22  
    23  func (f *Failure) canonicalMessage() string {
    24  	// Do we need to do anything to the message?
    25  	for _, c := range f.Message {
    26  		if '0' <= c && c <= '9' {
    27  			goto rewrite
    28  		}
    29  	}
    30  	return f.Message
    31  
    32  rewrite:
    33  	// Canonicalize any "word" of the message containing numbers.
    34  	//
    35  	// TODO: "Escape" any existing … to make this safe as a key
    36  	// for later use with canonicalFields (direct use is
    37  	// unimportant).
    38  	return numberWords.ReplaceAllString(f.Message, "…")
    39  }
    40  
    41  func (f *Failure) canonicalFields() []string {
    42  	fields := []string{}
    43  	msg := f.Message
    44  	for len(msg) > 0 {
    45  		next := numberWords.FindStringIndex(msg)
    46  		if next == nil {
    47  			fields = append(fields, msg)
    48  			break
    49  		}
    50  		if next[0] > 0 {
    51  			fields = append(fields, msg[:next[0]])
    52  		}
    53  		fields = append(fields, msg[next[0]:next[1]])
    54  		msg = msg[next[1]:]
    55  	}
    56  	return fields
    57  }
    58  
    59  // Classify groups a set of failures in to canonicalized failure
    60  // classes. The returned map maps from each failure class to the
    61  // indexes of the input failures in that class. Each input failure
    62  // will be in exactly one failure class.
    63  func Classify(fs []*Failure) map[Failure][]int {
    64  	// Map maximally canonicalized failures to input indexes.
    65  	canon := map[Failure][]int{}
    66  	for i, f := range fs {
    67  		// TODO: Match up nearby line numbers?
    68  		key := Failure{
    69  			Package:  f.Package,
    70  			Test:     f.Test,
    71  			Message:  f.canonicalMessage(),
    72  			Function: f.Function,
    73  			File:     f.File,
    74  		}
    75  
    76  		canon[key] = append(canon[key], i)
    77  	}
    78  
    79  	// De-canonicalize fields that all of the failures in a class
    80  	// have a common.
    81  	out := make(map[Failure][]int, len(canon))
    82  	for key, class := range canon {
    83  		if len(class) == 1 {
    84  			out[key] = class
    85  			continue
    86  		}
    87  
    88  		// Does the message need de-canonicalization?
    89  		if key.Message != fs[class[0]].Message {
    90  			fields := fs[class[0]].canonicalFields()
    91  			for _, fi := range class[1:] {
    92  				nfields := fs[fi].canonicalFields()
    93  				for i, field := range fields {
    94  					if field != nfields[i] {
    95  						fields[i] = "…"
    96  					}
    97  				}
    98  			}
    99  			key.Message = strings.Join(fields, "")
   100  		}
   101  
   102  		// De-canonicalize Line, OS, and Arch.
   103  		line, os, arch := fs[class[0]].Line, fs[class[0]].OS, fs[class[0]].Arch
   104  		for _, fi := range class[1:] {
   105  			if fs[fi].Line != line {
   106  				line = 0
   107  			}
   108  			if fs[fi].OS != os {
   109  				os = ""
   110  			}
   111  			if fs[fi].Arch != arch {
   112  				arch = ""
   113  			}
   114  		}
   115  		key.Line, key.OS, key.Arch = line, os, arch
   116  
   117  		out[key] = class
   118  	}
   119  
   120  	return out
   121  }