github.com/google/osv-scalibr@v0.4.1/veles/detect.go (about)

     1  // Copyright 2025 Google LLC
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package veles
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"fmt"
    21  	"io"
    22  )
    23  
    24  const (
    25  	// KiB is one binary Kilobyte (Kibibyte) i.e. 1024 bytes.
    26  	KiB = 1 << 10
    27  	// MiB is one binary Megabyte (Mibibyte).
    28  	MiB = 1 << 20
    29  	// GiB is one binary Gigabyte (Gibibyte).
    30  	GiB = 1 << 30
    31  
    32  	// MinReadLen is the minimum buffer size for reading chunks from an io.Reader.
    33  	MinReadLen = 64 * KiB
    34  
    35  	// MinRetainLen is the minimum number of bytes from the end of a chunk to
    36  	// retain to avoid false negatives from Secrets overlapping the edge of two
    37  	// chunks.
    38  	MinRetainLen = 1 * KiB
    39  )
    40  
// Detector finds instances of Secrets inside a chunk of text.
//
// While most commonly a detector will emit one specific type of secret, we also
// allow for Detectors to return multiple distinct types - thus the []Secret
// return type.
type Detector interface {
	// MaxSecretLen is the maximum length a secret from this detector can have.
	//
	// The DetectionEngine uses this value to decide how much of the previous
	// chunk has to be visible to the detector again and which matches near the
	// end of a chunk are deferred to the next one.
	//
	// It can be set to 0 but then the detector isn't guaranteed any minimum input
	// length and should instead implement its own mechanism to ensure it can find
	// its secrets; i.e. maintain an internal buffer.
	MaxSecretLen() uint32
	// Detect finds Secrets inside data and returns them alongside indices to the
	// start of the corresponding match.
	//
	// Both slices are read in parallel: the i-th index belongs to the i-th
	// Secret, so they must have the same length. Indices are relative to the
	// beginning of data.
	// The indices are only used internally to avoid duplicates.
	Detect(data []byte) ([]Secret, []int)
}
    58  
// DetectionEngine combines multiple Veles Detectors into a single engine that
// can read from an io.Reader. It outputs the results of applying all Detectors
// to that stream.
//
// Future optimizations might change how the engine works under the hood but its
// API should stay stable. It makes no guarantee about the order in which
// Secrets are found so calling code should not depend on it.
type DetectionEngine struct {
	// ds are the Detectors that are applied to every chunk of input.
	ds []Detector

	// readLen is the buffer size used for reading chunks from an io.Reader.
	// Together with retainLen it determines the size of the working buffer
	// (readLen+retainLen bytes).
	readLen int

	// retainLen is the buffer size used for keeping parts of a previous read
	// to avoid false negatives at the edge of two neighboring chunks.
	retainLen int
}
    76  
// DetectionEngineOption is an option to configure a DetectionEngine during
// creation via NewDetectionEngine.
//
// Options allow users to fine-tune the engine by overriding its defaults. For
// most use-cases, the defaults should be sensible enough.
type DetectionEngineOption func(*DetectionEngine)
    83  
    84  // WithReadLen overrides the buffer size used for reading chunks from io.Reader.
    85  //
    86  // The value can be smaller than MinReadLen.
    87  func WithReadLen(readLen uint32) DetectionEngineOption {
    88  	return func(e *DetectionEngine) {
    89  		e.readLen = int(readLen)
    90  	}
    91  }
    92  
    93  // WithRetainLen overrides the buffer size used for keeping parts of a previous
    94  // read to avoid false negatives at the edge of two neighboring chunks.
    95  //
    96  // These should usually be small against readLen. While it's technically
    97  // possible to have retainLen > readLen, that doesn't make a lot of semantic
    98  // sense and should be avoided.
    99  func WithRetainLen(retainLen uint32) DetectionEngineOption {
   100  	return func(e *DetectionEngine) {
   101  		e.retainLen = int(retainLen)
   102  	}
   103  }
   104  
   105  // NewDetectionEngine creates a new DetectionEngine with the given Detectors.
   106  //
   107  // This will choose sensible defaults for the internal buffers but those can
   108  // be overridden via DetectionEngineOptions if needed.
   109  //
   110  // Returns an error if no detectors are provided or if the retain buffer would
   111  // be too small to accommodate the detectors.
   112  func NewDetectionEngine(ds []Detector, opts ...DetectionEngineOption) (*DetectionEngine, error) {
   113  	if len(ds) == 0 {
   114  		return nil, errors.New("cannot create DetectionEngine without Detectors")
   115  	}
   116  	maxSecretLen := uint32(0)
   117  	for _, d := range ds {
   118  		m := d.MaxSecretLen()
   119  		if m > maxSecretLen {
   120  			maxSecretLen = m
   121  		}
   122  	}
   123  	defaultLen := int(nextPowerOf2(maxSecretLen))
   124  	e := &DetectionEngine{
   125  		ds:        ds,
   126  		readLen:   max(MinReadLen, defaultLen),
   127  		retainLen: max(MinRetainLen, defaultLen),
   128  	}
   129  	for _, opt := range opts {
   130  		opt(e)
   131  	}
   132  	if e.retainLen < int(maxSecretLen) {
   133  		return nil, fmt.Errorf("cannot create detection engine with retainLen (%d) < max secret len (%d)", e.retainLen, maxSecretLen)
   134  	}
   135  	return e, nil
   136  }
   137  
   138  // Detect reads from an io.Reader and returns the results of applying all of the
   139  // DetectionEngine's Detectors to that stream.
   140  //
   141  // It reads the input stream in chunks making sure that no matches are
   142  // accidentally missed at the edges between chunks.
   143  //
   144  // The secrets are returned in no particular order and calling code should not
   145  // depend on it (hyrumslaw.com).
   146  //
   147  // An error is returned if the provided context is done or if the io.Reader
   148  // returned any error other than io.EOF.
   149  func (e *DetectionEngine) Detect(ctx context.Context, r io.Reader) ([]Secret, error) {
   150  	if err := ctx.Err(); err != nil {
   151  		return nil, err
   152  	}
   153  	buf := make([]byte, 0, e.readLen+e.retainLen)
   154  	// Fill up the entire buffer on the first Read. This is the only time the
   155  	// engine reads more than readLen at once.
   156  	n, err := io.ReadFull(r, buf[:cap(buf)])
   157  	if err != nil {
   158  		if !isEOF(err) {
   159  			return nil, err
   160  		}
   161  		// Reader is already exhausted. No deduplication needed.
   162  		return e.detectAll(buf[:n]), nil
   163  	}
   164  	secrets := e.detectLeft(buf[:cap(buf)])
   165  	for {
   166  		if err := ctx.Err(); err != nil {
   167  			return secrets, err
   168  		}
   169  		// At this point the buffer is filled up to its cap because of io.ReadFull.
   170  		// We retain the last e.retainLen bytes from the end and read in another
   171  		// chunk of e.readLen.
   172  		copy(buf[:e.retainLen], buf[e.readLen:cap(buf)])
   173  		n, err := io.ReadFull(r, buf[e.retainLen:cap(buf)])
   174  		if err != nil {
   175  			if !isEOF(err) {
   176  				return nil, err
   177  			}
   178  			secrets = append(secrets, e.detectRight(buf[:e.retainLen+n])...)
   179  			break
   180  		}
   181  		secrets = append(secrets, e.detectMiddle(buf[:cap(buf)])...)
   182  	}
   183  	return secrets, nil
   184  }
   185  
   186  // detectAll returns all instances of Secrets inside the entire data buffer
   187  // found by the Engine's Detectors.
   188  func (e *DetectionEngine) detectAll(data []byte) []Secret {
   189  	var secrets []Secret
   190  	for _, d := range e.ds {
   191  		ss, _ := d.Detect(data)
   192  		secrets = append(secrets, ss...)
   193  	}
   194  	return secrets
   195  }
   196  
   197  // detectLeft returns all instances of Secrets inside the left part of the data
   198  // buffer that begin at a position before d.MaxSecretLen from the end for each
   199  // Detector d.
   200  func (e *DetectionEngine) detectLeft(data []byte) []Secret {
   201  	var secrets []Secret
   202  	for _, d := range e.ds {
   203  		right := len(data) - int(d.MaxSecretLen())
   204  		ss, ps := d.Detect(data)
   205  		for i, s := range ss {
   206  			p := ps[i]
   207  			if p >= right {
   208  				continue
   209  			}
   210  			secrets = append(secrets, s)
   211  		}
   212  	}
   213  	return secrets
   214  }
   215  
   216  // detectRight returns all instances of Secrets inside in the right part of the
   217  // data buffer at a position greater than the respective Detector's
   218  // MaxSecretLen.
   219  // This is only ever called with len(data) >= d.MaxSecretLen() for all d.
   220  func (e *DetectionEngine) detectRight(data []byte) []Secret {
   221  	var secrets []Secret
   222  	for _, d := range e.ds {
   223  		left := e.retainLen - int(d.MaxSecretLen())
   224  		ss, _ := d.Detect(data[left:])
   225  		secrets = append(secrets, ss...)
   226  	}
   227  	return secrets
   228  }
   229  
   230  // detectMiddle returns all instances of Secrets inside the data buffer coming
   231  // from the relevant window for each respective Detector.
   232  // The relevant window is the section of data that contains no redundant Secrets
   233  // for a given detector: i.e. only consider MaxSecretLen to the left and ignore
   234  // secrets that start after MaxSecretLen from the right.
   235  func (e *DetectionEngine) detectMiddle(data []byte) []Secret {
   236  	var secrets []Secret
   237  	for _, d := range e.ds {
   238  		left := e.retainLen - int(d.MaxSecretLen())
   239  		right := len(data) - left - int(d.MaxSecretLen())
   240  		ss, ps := d.Detect(data[left:])
   241  		for i, s := range ss {
   242  			p := ps[i]
   243  			if p >= right {
   244  				continue
   245  			}
   246  			secrets = append(secrets, s)
   247  		}
   248  	}
   249  	return secrets
   250  }
   251  
   252  func nextPowerOf2(x uint32) uint32 {
   253  	x--
   254  	x |= x >> 1
   255  	x |= x >> 2
   256  	x |= x >> 4
   257  	x |= x >> 8
   258  	x |= x >> 16
   259  	x++
   260  	return x
   261  }
   262  
   263  func isEOF(err error) bool {
   264  	return errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF)
   265  }