go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/compilefailureanalysis/heuristic/signal_extractor.go (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package heuristic
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"regexp"
    21  	"sort"
    22  	"strconv"
    23  	"strings"
    24  
    25  	"go.chromium.org/luci/bisection/model"
    26  	"go.chromium.org/luci/bisection/util"
    27  )
    28  
    29  const (
    30  	// Patterns for Python stack trace frames.
    31  	PYTHON_STACK_TRACE_FRAME_PATTERN_1 = `File "(?P<file>.+\.py)", line (?P<line>[0-9]+), in (?P<function>.+)`
    32  	PYTHON_STACK_TRACE_FRAME_PATTERN_2 = `(?P<function>[^\s]+) at (?P<file>.+\.py):(?P<line>[0-9]+)`
    33  	// Match file path separator: "/", "//", "\", "\\".
    34  	PATH_SEPARATOR_PATTERN = `(?:/{1,2}|\\{1,2})`
    35  
    36  	// Match drive root directory on Windows, like "C:/" or "C:\\".
    37  	WINDOWS_ROOT_PATTERN = `[a-zA-Z]:` + PATH_SEPARATOR_PATTERN
    38  
    39  	// Match system root directory on Linux/Mac.
    40  	UNIX_ROOT_PATTERN = `/+`
    41  
    42  	// Match system/drive root on Linux/Mac/Windows.
    43  	ROOT_DIR_PATTERN = "(?:" + WINDOWS_ROOT_PATTERN + "|" + UNIX_ROOT_PATTERN + ")"
    44  
    45  	// Match file/directory names and also match ., ..
    46  	FILE_NAME_PATTERN = `[\w\.-]+`
    47  
    48  	// Mark the beginning of the failure section in stdout log
    49  	FAILURE_SECTION_START_PREFIX = "FAILED: "
    50  
    51  	// Mark the end of the failure section in stdout log
    52  	FAILURE_SECTION_END_PATTERN_1 = `^\d+ errors? generated.`
    53  	FAILURE_SECTION_END_PATTERN_2 = `failed with exit code \d+`
    54  	// If it reads this line, it is also ends of failure section
    55  	OUTSIDE_FAILURE_SECTION_PATTERN = `\[\d+/\d+\]`
    56  
    57  	NINJA_FAILURE_LINE_END_PREFIX = `ninja: build stopped`
    58  	NINJA_ERROR_LINE_PREFIX       = `ninja: error`
    59  
    60  	STDLOG_NODE_PATTERN = `(?:"([^"]+)")|(\S+)`
    61  )
    62  
    63  // ExtractSignals extracts necessary signals for heuristic analysis from logs
    64  func ExtractSignals(c context.Context, compileLogs *model.CompileLogs) (*model.CompileFailureSignal, error) {
    65  	if compileLogs.NinjaLog == nil && compileLogs.StdOutLog == "" {
    66  		return nil, fmt.Errorf("Unable to extract signals from empty logs.")
    67  	}
    68  	// Prioritise extracting signals from ninja logs instead of stdout logs
    69  	if compileLogs.NinjaLog != nil {
    70  		return ExtractSignalsFromNinjaLog(c, compileLogs.NinjaLog)
    71  	}
    72  	return ExtractSignalsFromStdoutLog(c, compileLogs.StdOutLog)
    73  }
    74  
    75  // ExtractSignalsFromNinjaLog extracts necessary signals for heuristic analysis from ninja log
    76  func ExtractSignalsFromNinjaLog(c context.Context, ninjaLog *model.NinjaLog) (*model.CompileFailureSignal, error) {
    77  	signal := &model.CompileFailureSignal{}
    78  	for _, failure := range ninjaLog.Failures {
    79  		edge := &model.CompileFailureEdge{
    80  			Rule:         failure.Rule,
    81  			OutputNodes:  failure.OutputNodes,
    82  			Dependencies: normalizeDependencies(failure.Dependencies),
    83  		}
    84  		signal.Edges = append(signal.Edges, edge)
    85  		signal.Nodes = append(signal.Nodes, failure.OutputNodes...)
    86  		e := extractFiles(signal, failure.Output)
    87  		if e != nil {
    88  			return nil, e
    89  		}
    90  	}
    91  	return signal, nil
    92  }
    93  
    94  func extractFiles(signal *model.CompileFailureSignal, output string) error {
    95  	pythonPatterns := []*regexp.Regexp{
    96  		regexp.MustCompile(PYTHON_STACK_TRACE_FRAME_PATTERN_1),
    97  		regexp.MustCompile(PYTHON_STACK_TRACE_FRAME_PATTERN_2),
    98  	}
    99  	filePathLinePattern := regexp.MustCompile(getFileLinePathPatternStr())
   100  
   101  	lines := strings.Split(output, "\n")
   102  	for i, line := range lines {
   103  		// Do not extract the first line
   104  		if i == 0 {
   105  			continue
   106  		}
   107  		// Check if the line matches python pattern
   108  		matchedPython := false
   109  		for _, pythonPattern := range pythonPatterns {
   110  			matches, err := util.MatchedNamedGroup(pythonPattern, line)
   111  			if err == nil {
   112  				pyLine, e := strconv.Atoi(matches["line"])
   113  				if e != nil {
   114  					return e
   115  				}
   116  				signal.AddLine(util.NormalizeFilePath(matches["file"]), pyLine)
   117  				matchedPython = true
   118  				continue
   119  			}
   120  		}
   121  		if matchedPython {
   122  			continue
   123  		}
   124  		// Non-python cases
   125  		matches := filePathLinePattern.FindAllStringSubmatch(line, -1)
   126  		if matches != nil {
   127  			for _, match := range matches {
   128  				if len(match) != 3 {
   129  					return fmt.Errorf("Invalid line: %s", line)
   130  				}
   131  				// match[1] is file, match[2] is line number
   132  				if match[2] == "" {
   133  					signal.AddFilePath(util.NormalizeFilePath(match[1]))
   134  				} else {
   135  					lineInt, e := strconv.Atoi(match[2])
   136  					if e != nil {
   137  						return e
   138  					}
   139  					signal.AddLine(util.NormalizeFilePath(match[1]), lineInt)
   140  				}
   141  			}
   142  		}
   143  	}
   144  	return nil
   145  }
   146  
   147  func extractFilesFromLine(signal *model.CompileFailureSignal, line string) error {
   148  	pythonPatterns := []*regexp.Regexp{
   149  		regexp.MustCompile(PYTHON_STACK_TRACE_FRAME_PATTERN_1),
   150  		regexp.MustCompile(PYTHON_STACK_TRACE_FRAME_PATTERN_2),
   151  	}
   152  	filePathLinePattern := regexp.MustCompile(getFileLinePathPatternStr())
   153  
   154  	// Check if the line matches python pattern
   155  	matchedPython := false
   156  	for _, pythonPattern := range pythonPatterns {
   157  		matches, err := util.MatchedNamedGroup(pythonPattern, line)
   158  		if err == nil {
   159  			pyLine, e := strconv.Atoi(matches["line"])
   160  			if e != nil {
   161  				return e
   162  			}
   163  			signal.AddLine(util.NormalizeFilePath(matches["file"]), pyLine)
   164  			matchedPython = true
   165  			continue
   166  		}
   167  	}
   168  	if matchedPython {
   169  		return nil
   170  	}
   171  	// Non-python cases
   172  	matches := filePathLinePattern.FindAllStringSubmatch(line, -1)
   173  	if matches != nil {
   174  		for _, match := range matches {
   175  			if len(match) != 3 {
   176  				return fmt.Errorf("Invalid line: %s", line)
   177  			}
   178  			// match[1] is file, match[2] is line number
   179  			if match[2] == "" {
   180  				signal.AddFilePath(util.NormalizeFilePath(match[1]))
   181  			} else {
   182  				lineInt, e := strconv.Atoi(match[2])
   183  				if e != nil {
   184  					return e
   185  				}
   186  				signal.AddLine(util.NormalizeFilePath(match[1]), lineInt)
   187  			}
   188  		}
   189  	}
   190  	return nil
   191  }
   192  
   193  func normalizeDependencies(dependencies []string) []string {
   194  	result := []string{}
   195  	for _, dependency := range dependencies {
   196  		result = append(result, util.NormalizeFilePath(dependency))
   197  	}
   198  	return result
   199  }
   200  
   201  // ExtractSignalsFromStdoutLog extracts necessary signals for heuristic analysis from stdout log
   202  func ExtractSignalsFromStdoutLog(c context.Context, stdoutLog string) (*model.CompileFailureSignal, error) {
   203  	signal := &model.CompileFailureSignal{}
   204  	lines := strings.Split(stdoutLog, "\n")
   205  	failureSectionEndPattern1 := regexp.MustCompile(FAILURE_SECTION_END_PATTERN_1)
   206  	failureSectionEndPattern2 := regexp.MustCompile(FAILURE_SECTION_END_PATTERN_2)
   207  	outsideFailureSectionPattern := regexp.MustCompile(OUTSIDE_FAILURE_SECTION_PATTERN)
   208  	failureStarted := false
   209  	for _, line := range lines {
   210  		line = strings.Trim(line, " \t")
   211  		if strings.HasPrefix(line, FAILURE_SECTION_START_PREFIX) {
   212  			failureStarted = true
   213  			line = line[len(FAILURE_SECTION_START_PREFIX):]
   214  			signal.Nodes = append(signal.Nodes, extractNodes(line)...)
   215  			continue
   216  		} else if failureStarted && strings.HasPrefix(line, NINJA_FAILURE_LINE_END_PREFIX) {
   217  			// End parsing
   218  			break
   219  		} else if failureStarted && (failureSectionEndPattern1.MatchString(line) || failureSectionEndPattern2.MatchString(line) || outsideFailureSectionPattern.MatchString(line)) {
   220  			failureStarted = false
   221  		}
   222  
   223  		if failureStarted || strings.HasPrefix(line, NINJA_ERROR_LINE_PREFIX) {
   224  			extractFilesFromLine(signal, line)
   225  		}
   226  	}
   227  	return signal, nil
   228  }
   229  
   230  // extractNode returns the list of failed output nodes.
   231  // Possible format:
   232  // FAILED: obj/path/to/file.o
   233  // FAILED: target.exe
   234  // FAILED: "target with space in name"
   235  func extractNodes(line string) []string {
   236  	pattern := regexp.MustCompile(STDLOG_NODE_PATTERN)
   237  	matches := pattern.FindAllStringSubmatch(line, -1)
   238  	result := []string{}
   239  	for _, match := range matches {
   240  		for i := 1; i <= 2; i++ {
   241  			if match[i] != "" {
   242  				result = append(result, match[i])
   243  			}
   244  		}
   245  	}
   246  	return result
   247  }
   248  
   249  // getFileLinePathPatternStr matches a full file path and line number.
   250  // It could match files with or without line numbers like below:
   251  //
   252  //	c:\\a\\b.txt:12
   253  //	c:\a\b.txt(123)
   254  //	c:\a\b.txt:[line 123]
   255  //	D:/a/b.txt
   256  //	/a/../b/./c.txt
   257  //	a/b/c.txt
   258  //	//BUILD.gn:246
   259  func getFileLinePathPatternStr() string {
   260  	pattern := `(`
   261  	pattern += ROOT_DIR_PATTERN + "?"                                    // System/Drive root directory.
   262  	pattern += `(?:` + FILE_NAME_PATTERN + PATH_SEPARATOR_PATTERN + `)*` // Directories.
   263  	pattern += FILE_NAME_PATTERN + `\.` + getFileExtensionPatternStr()
   264  	pattern += `)`                           // File name and extension.
   265  	pattern += `(?:(?:[\(:]|\[line )(\d+))?` // Line number might not be available.
   266  	return pattern
   267  }
   268  
   269  // getFileExtensionPattern matches supported file extensions.
   270  // Sort extension list to avoid non-full match like 'c' matching 'c' in 'cpp'.
   271  func getFileExtensionPatternStr() string {
   272  	extensions := getSupportedFileExtension()
   273  	sort.Sort(sort.Reverse(sort.StringSlice(extensions)))
   274  	return fmt.Sprintf("(?:%s)", strings.Join(extensions, "|"))
   275  }
   276  
   277  // getSupportedFileExtension get gile extensions to filter out files from log.
   278  func getSupportedFileExtension() []string {
   279  	return []string{
   280  		"c",
   281  		"cc",
   282  		"cpp",
   283  		"css",
   284  		"exe",
   285  		"gn",
   286  		"gni",
   287  		"gyp",
   288  		"gypi",
   289  		"h",
   290  		"hh",
   291  		"html",
   292  		"idl",
   293  		"isolate",
   294  		"java",
   295  		"js",
   296  		"json",
   297  		"m",
   298  		"mm",
   299  		"mojom",
   300  		"nexe",
   301  		"o",
   302  		"obj",
   303  		"py",
   304  		"pyc",
   305  		"rc",
   306  		"sh",
   307  		"sha1",
   308  		"ts",
   309  		"txt",
   310  	}
   311  }