github.com/anchore/syft@v1.38.2/internal/regex_helpers.go (about)

     1  package internal
     2  
     3  import (
     4  	"io"
     5  	"regexp"
     6  )
     7  
     8  const readerChunkSize = 1024 * 1024
     9  
    10  // MatchNamedCaptureGroups takes a regular expression and string and returns all of the named capture group results in a map.
    11  // This is only for the first match in the regex. Callers shouldn't be providing regexes with multiple capture groups with the same name.
    12  func MatchNamedCaptureGroups(regEx *regexp.Regexp, content string) map[string]string {
    13  	// note: we are looking across all matches and stopping on the first non-empty match. Why? Take the following example:
    14  	// input: "cool something to match against" pattern: `((?P<name>match) (?P<version>against))?`. Since the pattern is
    15  	// encapsulated in an optional capture group, there will be results for each character, but the results will match
    16  	// on nothing. The only "true" match will be at the end ("match against").
    17  	allMatches := regEx.FindAllStringSubmatch(content, -1)
    18  	var results map[string]string
    19  	for _, match := range allMatches {
    20  		// fill a candidate results map with named capture group results, accepting empty values, but not groups with
    21  		// no names
    22  		for nameIdx, name := range regEx.SubexpNames() {
    23  			if nameIdx > len(match) || len(name) == 0 {
    24  				continue
    25  			}
    26  			if results == nil {
    27  				results = make(map[string]string)
    28  			}
    29  			results[name] = match[nameIdx]
    30  		}
    31  		// note: since we are looking for the first best potential match we should stop when we find the first one
    32  		// with non-empty results.
    33  		if !isEmptyMap(results) {
    34  			break
    35  		}
    36  	}
    37  	return results
    38  }
    39  
    40  // MatchNamedCaptureGroupsFromReader matches named capture groups from a reader, assuming the pattern fits within
    41  // 1.5x the reader chunk size (1MB * 1.5).
    42  func MatchNamedCaptureGroupsFromReader(re *regexp.Regexp, r io.Reader) (map[string]string, error) {
    43  	results := make(map[string]string)
    44  	matches, err := processReaderInChunks(r, readerChunkSize, matchNamedCaptureGroupsHandler(re, results))
    45  	if err != nil {
    46  		return nil, err
    47  	}
    48  	if !matches {
    49  		return nil, nil
    50  	}
    51  	return results, nil
    52  }
    53  
    54  // MatchAnyFromReader matches any of the provided regular expressions from a reader, assuming the pattern fits within
    55  // 1.5x the reader chunk size (1MB * 1.5).
    56  func MatchAnyFromReader(r io.Reader, res ...*regexp.Regexp) (bool, error) {
    57  	return processReaderInChunks(r, readerChunkSize, matchAnyHandler(res))
    58  }
    59  
    60  func matchNamedCaptureGroupsHandler(re *regexp.Regexp, results map[string]string) func(data []byte) (bool, error) {
    61  	return func(data []byte) (bool, error) {
    62  		if match := re.FindSubmatch(data); match != nil {
    63  			groupNames := re.SubexpNames()
    64  			for i, name := range groupNames {
    65  				if i > 0 && name != "" {
    66  					results[name] = string(match[i])
    67  				}
    68  			}
    69  			return true, nil
    70  		}
    71  		return false, nil
    72  	}
    73  }
    74  
    75  func matchAnyHandler(res []*regexp.Regexp) func(data []byte) (bool, error) {
    76  	return func(data []byte) (bool, error) {
    77  		for _, re := range res {
    78  			if re.Match(data) {
    79  				return true, nil
    80  			}
    81  		}
    82  		return false, nil
    83  	}
    84  }
    85  
    86  // processReaderInChunks reads from the provided reader in chunks and calls the provided handler with each chunk + portion of the previous neighboring chunk.
    87  // Note that we only overlap the last half of the previous chunk with the current chunk to avoid missing matches that span chunk boundaries.
    88  func processReaderInChunks(rdr io.Reader, chunkSize int, handler func(data []byte) (bool, error)) (bool, error) {
    89  	half := chunkSize / 2
    90  	bufSize := chunkSize + half
    91  	buf := make([]byte, bufSize)
    92  	lastRead := 0
    93  
    94  	for {
    95  		offset := half
    96  		if lastRead < half {
    97  			offset = lastRead
    98  		}
    99  		start := half - offset
   100  		if lastRead > 0 {
   101  			copy(buf[start:], buf[half+offset:half+lastRead])
   102  		}
   103  		n, err := rdr.Read(buf[half:])
   104  		if err != nil {
   105  			break
   106  		}
   107  
   108  		// process the combined data with the handler
   109  		matched, handlerErr := handler(buf[start : half+n])
   110  		if handlerErr != nil {
   111  			return false, handlerErr
   112  		}
   113  		if matched {
   114  			return true, nil
   115  		}
   116  
   117  		lastRead = n
   118  	}
   119  
   120  	return false, nil
   121  }
   122  
   123  func isEmptyMap(m map[string]string) bool {
   124  	if len(m) == 0 {
   125  		return true
   126  	}
   127  	for _, value := range m {
   128  		if value != "" {
   129  			return false
   130  		}
   131  	}
   132  	return true
   133  }