github.com/google/osv-scalibr@v0.4.1/veles/detect.go (about) 1 // Copyright 2025 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package veles 16 17 import ( 18 "context" 19 "errors" 20 "fmt" 21 "io" 22 ) 23 24 const ( 25 // KiB is one binary Kilobyte (Kibibyte) i.e. 1024 bytes. 26 KiB = 1 << 10 27 // MiB is one binary Megabyte (Mibibyte). 28 MiB = 1 << 20 29 // GiB is one binary Gigabyte (Gibibyte). 30 GiB = 1 << 30 31 32 // MinReadLen is the minimum buffer size for reading chunks from an io.Reader. 33 MinReadLen = 64 * KiB 34 35 // MinRetainLen is the minimum number of bytes from the end of a chunk to 36 // retain to avoid false negatives from Secrets overlapping the edge of two 37 // chunks. 38 MinRetainLen = 1 * KiB 39 ) 40 41 // Detector finds instances of Secrets inside a chunk of text. 42 // 43 // While most commonly a detector will emit one specific type of secret, we also 44 // allow for Detectors to return multiple distinct types - thus the []Secret 45 // return type. 46 type Detector interface { 47 // MaxSecretLen is the maximum length a secret from this detector can have. 48 // 49 // It can be set to 0 but then the detector isn't guaranteed any minimum input 50 // length and should instead implement its own mechanism to ensure it can find 51 // its secrets; i.e. maintain an internal buffer. 52 MaxSecretLen() uint32 53 // Detect finds Secrets inside data and returns them alongside indices to the 54 // start of the corresponding match. 55 // The latter is only used internally to avoid duplicates. 56 Detect(data []byte) ([]Secret, []int) 57 } 58 59 // DetectionEngine combines multiple Veles Detectors into a single engine that 60 // can read from an io.Reader. It outputs the results of applying all Detectors 61 // to that stream. 62 // 63 // Future optimizations might change how the engine works under the hood but its 64 // API should stay stable. It makes no guarantee about the order in which 65 // Secrets are found so calling code should not depend on it. 66 type DetectionEngine struct { 67 ds []Detector 68 69 // readLen is the buffer size used for reading chunks from an io.Reader. 70 readLen int 71 72 // retainLen is the buffer size used for keeping parts of a previous read 73 // to avoid false negatives at the edge of two neighboring chunks. 74 retainLen int 75 } 76 77 // DetectionEngineOption is an option to configure a DetectionEngine during 78 // creation via NewDetectionEngine. 79 // 80 // This allows user to fine tune the engine by overriding its defaults. For most 81 // use-cases, the defaults should be sensible enough. 82 type DetectionEngineOption func(*DetectionEngine) 83 84 // WithReadLen overrides the buffer size used for reading chunks from io.Reader. 85 // 86 // The value can be smaller than MinReadLen. 87 func WithReadLen(readLen uint32) DetectionEngineOption { 88 return func(e *DetectionEngine) { 89 e.readLen = int(readLen) 90 } 91 } 92 93 // WithRetainLen overrides the buffer size used for keeping parts of a previous 94 // read to avoid false negatives at the edge of two neighboring chunks. 95 // 96 // These should usually be small against readLen. While it's technically 97 // possible to have retainLen > readLen, that doesn't make a lot of semantic 98 // sense and should be avoided. 99 func WithRetainLen(retainLen uint32) DetectionEngineOption { 100 return func(e *DetectionEngine) { 101 e.retainLen = int(retainLen) 102 } 103 } 104 105 // NewDetectionEngine creates a new DetectionEngine with the given Detectors. 106 // 107 // This will choose sensible defaults for the internal buffers but those can 108 // be overridden via DetectionEngineOptions if needed. 109 // 110 // Returns an error if no detectors are provided or if the retain buffer would 111 // be too small to accommodate the detectors. 112 func NewDetectionEngine(ds []Detector, opts ...DetectionEngineOption) (*DetectionEngine, error) { 113 if len(ds) == 0 { 114 return nil, errors.New("cannot create DetectionEngine without Detectors") 115 } 116 maxSecretLen := uint32(0) 117 for _, d := range ds { 118 m := d.MaxSecretLen() 119 if m > maxSecretLen { 120 maxSecretLen = m 121 } 122 } 123 defaultLen := int(nextPowerOf2(maxSecretLen)) 124 e := &DetectionEngine{ 125 ds: ds, 126 readLen: max(MinReadLen, defaultLen), 127 retainLen: max(MinRetainLen, defaultLen), 128 } 129 for _, opt := range opts { 130 opt(e) 131 } 132 if e.retainLen < int(maxSecretLen) { 133 return nil, fmt.Errorf("cannot create detection engine with retainLen (%d) < max secret len (%d)", e.retainLen, maxSecretLen) 134 } 135 return e, nil 136 } 137 138 // Detect reads from an io.Reader and returns the results of applying all of the 139 // DetectionEngine's Detectors to that stream. 140 // 141 // It reads the input stream in chunks making sure that no matches are 142 // accidentally missed at the edges between chunks. 143 // 144 // The secrets are returned in no particular order and calling code should not 145 // depend on it (hyrumslaw.com). 146 // 147 // An error is returned if the provided context is done or if the io.Reader 148 // returned any error other than io.EOF. 149 func (e *DetectionEngine) Detect(ctx context.Context, r io.Reader) ([]Secret, error) { 150 if err := ctx.Err(); err != nil { 151 return nil, err 152 } 153 buf := make([]byte, 0, e.readLen+e.retainLen) 154 // Fill up the entire buffer on the first Read. This is the only time the 155 // engine reads more than readLen at once. 156 n, err := io.ReadFull(r, buf[:cap(buf)]) 157 if err != nil { 158 if !isEOF(err) { 159 return nil, err 160 } 161 // Reader is already exhausted. No deduplication needed. 162 return e.detectAll(buf[:n]), nil 163 } 164 secrets := e.detectLeft(buf[:cap(buf)]) 165 for { 166 if err := ctx.Err(); err != nil { 167 return secrets, err 168 } 169 // At this point the buffer is filled up to its cap because of io.ReadFull. 170 // We retain the last e.retainLen bytes from the end and read in another 171 // chunk of e.readLen. 172 copy(buf[:e.retainLen], buf[e.readLen:cap(buf)]) 173 n, err := io.ReadFull(r, buf[e.retainLen:cap(buf)]) 174 if err != nil { 175 if !isEOF(err) { 176 return nil, err 177 } 178 secrets = append(secrets, e.detectRight(buf[:e.retainLen+n])...) 179 break 180 } 181 secrets = append(secrets, e.detectMiddle(buf[:cap(buf)])...) 182 } 183 return secrets, nil 184 } 185 186 // detectAll returns all instances of Secrets inside the entire data buffer 187 // found by the Engine's Detectors. 188 func (e *DetectionEngine) detectAll(data []byte) []Secret { 189 var secrets []Secret 190 for _, d := range e.ds { 191 ss, _ := d.Detect(data) 192 secrets = append(secrets, ss...) 193 } 194 return secrets 195 } 196 197 // detectLeft returns all instances of Secrets inside the left part of the data 198 // buffer that begin at a position before d.MaxSecretLen from the end for each 199 // Detector d. 200 func (e *DetectionEngine) detectLeft(data []byte) []Secret { 201 var secrets []Secret 202 for _, d := range e.ds { 203 right := len(data) - int(d.MaxSecretLen()) 204 ss, ps := d.Detect(data) 205 for i, s := range ss { 206 p := ps[i] 207 if p >= right { 208 continue 209 } 210 secrets = append(secrets, s) 211 } 212 } 213 return secrets 214 } 215 216 // detectRight returns all instances of Secrets inside in the right part of the 217 // data buffer at a position greater than the respective Detector's 218 // MaxSecretLen. 219 // This is only ever called with len(data) >= d.MaxSecretLen() for all d. 220 func (e *DetectionEngine) detectRight(data []byte) []Secret { 221 var secrets []Secret 222 for _, d := range e.ds { 223 left := e.retainLen - int(d.MaxSecretLen()) 224 ss, _ := d.Detect(data[left:]) 225 secrets = append(secrets, ss...) 226 } 227 return secrets 228 } 229 230 // detectMiddle returns all instances of Secrets inside the data buffer coming 231 // from the relevant window for each respective Detector. 232 // The relevant window is the section of data that contains no redundant Secrets 233 // for a given detector: i.e. only consider MaxSecretLen to the left and ignore 234 // secrets that start after MaxSecretLen from the right. 235 func (e *DetectionEngine) detectMiddle(data []byte) []Secret { 236 var secrets []Secret 237 for _, d := range e.ds { 238 left := e.retainLen - int(d.MaxSecretLen()) 239 right := len(data) - left - int(d.MaxSecretLen()) 240 ss, ps := d.Detect(data[left:]) 241 for i, s := range ss { 242 p := ps[i] 243 if p >= right { 244 continue 245 } 246 secrets = append(secrets, s) 247 } 248 } 249 return secrets 250 } 251 252 func nextPowerOf2(x uint32) uint32 { 253 x-- 254 x |= x >> 1 255 x |= x >> 2 256 x |= x >> 4 257 x |= x >> 8 258 x |= x >> 16 259 x++ 260 return x 261 } 262 263 func isEOF(err error) bool { 264 return errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) 265 }