sigs.k8s.io/prow@v0.0.0-20240503223140-c5e374dc7eb1/pkg/sidecar/censor.go

/*
Copyright 2021 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package sidecar

import (
	"archive/tar"
	"compress/gzip"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/mattn/go-zglob"
	"github.com/sirupsen/logrus"
	"golang.org/x/sync/semaphore"
	"gopkg.in/ini.v1"
	kerrors "k8s.io/apimachinery/pkg/util/errors"

	"sigs.k8s.io/prow/pkg/secretutil"
)

// defaultBufferSize is the default censoring buffer size, 10MiB.
const defaultBufferSize = 10 * 1024 * 1024

func (o Options) censor() error {
	logrus.Info("Starting to censor data")
	startTime := time.Now()
	defer func() { logrus.WithField("duration", time.Since(startTime).String()).Info("Finished censoring data") }()

	var concurrency int64
	if o.CensoringOptions.CensoringConcurrency == nil {
		concurrency = int64(10)
	} else {
		concurrency = *o.CensoringOptions.CensoringConcurrency
	}
	logrus.WithField("concurrency", concurrency).Debug("Censoring artifacts.")
	sem := semaphore.NewWeighted(concurrency)
	wg := &sync.WaitGroup{}
	errors := make(chan error)
	var errs []error
	errLock := &sync.Mutex{}
	go func() {
		errLock.Lock()
		for err := range errors {
			errs = append(errs, err)
		}
		errLock.Unlock()
	}()

	secrets, err := loadSecrets(o.CensoringOptions.SecretDirectories, o.CensoringOptions.IniFilenames)
	if err != nil {
		// TODO(petr-muller): This return makes the censoring mechanism fragile: a single failure in
		// `loadSecrets` will prevent us from censoring all other secrets that were successfully loaded.
		// Alternatively, we could be more strict and just bail out at our call site in run.go:preUpload()
		// instead of just emitting a warning there. But failing fast combined with just warning about
		// the failure is not a sound approach for a secret-censoring mechanism.
		return fmt.Errorf("could not load secrets: %w", err)
	}
	logrus.WithField("secrets", len(secrets)).Debug("Loaded secrets to censor.")
	censorer := secretutil.NewCensorer()
	censorer.RefreshBytes(secrets...)

	bufferSize := defaultBufferSize
	if o.CensoringOptions.CensoringBufferSize != nil {
		bufferSize = *o.CensoringOptions.CensoringBufferSize
	}
	if largest := censorer.LargestSecret(); 2*largest > bufferSize {
		bufferSize = 2 * largest
	}
	logrus.WithField("buffer_size", bufferSize).Debug("Determined censoring buffer size.")
	censorFile := fileCensorer(sem, errors, censorer, bufferSize)
	censor := func(file string) {
		censorFile(wg, file)
	}

	for _, entry := range o.Entries {
		logPath := entry.ProcessLog
		censor(logPath)
	}

	for _, item := range o.GcsOptions.Items {
		if err := filepath.Walk(item, func(absPath string, info os.FileInfo, err error) error {
			// This function must never return an error: all files must be processed, as otherwise
			// we may end up not censoring some files that are eventually uploaded.
			if err != nil {
				errors <- err
				return nil
			}
			if info.IsDir() || info.Mode()&os.ModeSymlink == os.ModeSymlink {
				return nil
			}
			logger := logrus.WithField("path", absPath)
			relpath, shouldNotErr := filepath.Rel(item, absPath)
			if shouldNotErr != nil {
				logrus.WithError(shouldNotErr).Warnf("filepath.Rel returned an error, but we assumed there must be a relative path between %s and %s", item, absPath)
			}
			should, err := shouldCensor(*o.CensoringOptions, relpath)
			if err != nil {
				errors <- fmt.Errorf("could not determine if we should censor path: %w", err)
				return nil
			}
			if !should {
				return nil
			}

			contentType, err := determineContentType(absPath)
			if err != nil {
				errors <- fmt.Errorf("could not determine content type of %s: %w", absPath, err)
				return nil
			}

			switch contentType {
			case "application/x-gzip", "application/zip":
				logger.Debug("Censoring archive.")
				if err := handleArchive(absPath, censorFile); err != nil {
					errors <- fmt.Errorf("could not censor archive %s: %w", absPath, err)
					return nil
				}
			default:
				logger.Debug("Censoring file.")
				censor(absPath)
			}
			return nil
		}); err != nil {
			// This should never happen because the WalkFunc above is not supposed to return an error,
			// but if it somehow does, let's be defensive and log it.
			// DO NOT RETURN, so that we continue to iterate o.GcsOptions.
			errors <- fmt.Errorf("could not walk items to censor them: %w", err)
		}
	}

	wg.Wait()
	close(errors)
	errLock.Lock()
	return kerrors.NewAggregate(errs)
}
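
// The loop above fans every file out to its own goroutine, with a weighted
// semaphore capping how many censor at once and a WaitGroup tracking
// completion. A minimal standalone sketch of the same pattern (the item list
// and worker body here are hypothetical):
//
//	sem := semaphore.NewWeighted(10)
//	wg := &sync.WaitGroup{}
//	for _, item := range []string{"a.log", "b.log"} {
//		wg.Add(1)
//		go func(item string) {
//			defer wg.Done()
//			if err := sem.Acquire(context.Background(), 1); err != nil {
//				return
//			}
//			defer sem.Release(1)
//			// ... process item ...
//		}(item)
//	}
//	wg.Wait()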

// shouldCensor determines whether a relative path should be censored, honoring
// the exclude and include globs; explicit excludes take precedence over includes.
func shouldCensor(options CensoringOptions, path string) (bool, error) {
	for _, glob := range options.ExcludeDirectories {
		found, err := zglob.Match(glob, path)
		if err != nil {
			return false, err
		}
		if found {
			return false, nil // when explicitly excluded, do not censor
		}
	}
	for _, glob := range options.IncludeDirectories {
		found, err := zglob.Match(glob, path)
		if err != nil {
			return false, err
		}
		if found {
			return true, nil // when explicitly included, censor
		}
	}
	return len(options.IncludeDirectories) == 0, nil // censor if no explicit includes exist
}
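
// For illustration, assuming hypothetical options and zglob's `**` semantics:
//
//	opts := CensoringOptions{
//		ExcludeDirectories: []string{"artifacts/logs/**"},
//		IncludeDirectories: []string{"artifacts/**"},
//	}
//	shouldCensor(opts, "artifacts/logs/build.log") // false: the exclude wins
//	shouldCensor(opts, "artifacts/junit.xml")      // true: matches an include
//	shouldCensor(opts, "metadata.json")            // false: includes exist but none match
//
// With no IncludeDirectories at all, everything not excluded is censored.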

// fileCensorer returns a closure over all of our synchronization for a clean handler signature
func fileCensorer(sem *semaphore.Weighted, errors chan<- error, censorer secretutil.Censorer, bufferSize int) func(wg *sync.WaitGroup, file string) {
	return func(wg *sync.WaitGroup, file string) {
		wg.Add(1)
		go func() {
			// mark ourselves done even if acquiring the semaphore fails, so wg.Wait() cannot hang
			defer wg.Done()
			if err := sem.Acquire(context.Background(), 1); err != nil {
				errors <- err
				return
			}
			defer sem.Release(1)
			errors <- handleFile(file, censorer, bufferSize)
		}()
	}
}

// determineContentType determines the content type of the file
func determineContentType(path string) (string, error) {
	file, err := os.Open(path)
	if err != nil {
		return "", fmt.Errorf("could not open file to check content type: %w", err)
	}
	defer func() {
		if err := file.Close(); err != nil {
			logrus.WithError(err).Warn("Could not close input file while detecting content type.")
		}
	}()

	header := make([]byte, 512)
	if _, err := file.Read(header); err != nil && err != io.EOF {
		return "", fmt.Errorf("could not read file to check content type: %w", err)
	}
	return http.DetectContentType(header), nil
}
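
// For illustration, http.DetectContentType sniffs well-known magic bytes, which
// is what routes gzipped tarballs into the archive-handling path above:
//
//	http.DetectContentType([]byte{0x1f, 0x8b, 0x08, 0x00}) // "application/x-gzip"
//	http.DetectContentType([]byte("PK\x03\x04"))           // "application/zip"
//	http.DetectContentType([]byte("plain text"))           // "text/plain; charset=utf-8"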

// handleArchive unravels the archive in order to censor data in the files that were added to it.
// This is mostly stolen from build/internal/untar/untar.go
func handleArchive(archivePath string, censor func(wg *sync.WaitGroup, file string)) error {
	outputDir, err := os.MkdirTemp("", "tmp-unpack")
	if err != nil {
		return fmt.Errorf("could not create temporary dir for unpacking: %w", err)
	}

	defer func() {
		if err := os.RemoveAll(outputDir); err != nil {
			logrus.WithError(err).Warn("Failed to clean up temporary directory for archive")
		}
	}()

	if err := unarchive(archivePath, outputDir); err != nil {
		return fmt.Errorf("could not unpack archive: %w", err)
	}

	children := &sync.WaitGroup{}
	if err := filepath.Walk(outputDir, func(absPath string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}

		censor(children, absPath)
		return nil
	}); err != nil {
		return fmt.Errorf("could not walk unpacked archive to censor its contents: %w", err)
	}

	children.Wait()
	if err := archive(outputDir, archivePath); err != nil {
		return fmt.Errorf("could not re-pack archive: %w", err)
	}
	return nil
}

// unarchive unpacks the archive into the destination
func unarchive(archivePath, destPath string) error {
	input, err := os.Open(archivePath)
	if err != nil {
		return fmt.Errorf("could not open archive for unpacking: %w", err)
	}
	zipReader, err := gzip.NewReader(input)
	if err != nil {
		return fmt.Errorf("could not read archive: %w", err)
	}
	tarReader := tar.NewReader(zipReader)
	defer func() {
		if err := zipReader.Close(); err != nil {
			logrus.WithError(err).Warn("Could not close zip reader after unarchiving.")
		}
		if err := input.Close(); err != nil {
			logrus.WithError(err).Warn("Could not close input file after unarchiving.")
		}
	}()

	for {
		entry, err := tarReader.Next()
		if err == io.EOF {
			break
		}
		if err != nil {
			return fmt.Errorf("could not read archive: %w", err)
		}
		if !validRelPath(entry.Name) {
			return fmt.Errorf("tar contained invalid name %q", entry.Name)
		}
		rel := filepath.FromSlash(entry.Name)
		abs := filepath.Join(destPath, rel)
		mode := entry.FileInfo().Mode()
		switch {
		case mode.IsDir():
			if err := os.MkdirAll(abs, 0755); err != nil {
				return fmt.Errorf("could not create directory while unpacking archive: %w", err)
			}
		case mode.IsRegular():
			file, err := os.OpenFile(abs, os.O_RDWR|os.O_CREATE|os.O_TRUNC, mode.Perm())
			if err != nil {
				return err
			}
			n, err := io.Copy(file, tarReader)
			if closeErr := file.Close(); closeErr != nil && err == nil {
				return fmt.Errorf("error closing %s: %w", abs, closeErr)
			}
			if err != nil {
				return fmt.Errorf("error writing to %s: %w", abs, err)
			}
			if n != entry.Size {
				return fmt.Errorf("only wrote %d bytes to %s; expected %d", n, abs, entry.Size)
			}
		}
	}
	return nil
}

// validRelPath reports whether p is a safe, relative, slash-separated path that
// cannot escape the unpacking directory.
func validRelPath(p string) bool {
	if p == "" || strings.Contains(p, `\`) || strings.HasPrefix(p, "/") || strings.Contains(p, "../") {
		return false
	}
	return true
}
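
// For illustration, validRelPath is the guard against "zip slip" style path
// traversal when entry names are joined onto destPath:
//
//	validRelPath("dir/file.txt")     // true
//	validRelPath("../../etc/passwd") // false: escapes the unpack dir
//	validRelPath("/etc/passwd")      // false: absolute
//	validRelPath(`dir\file.txt`)     // false: backslashes are rejected outright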

// archive re-packs the dir into the destination
func archive(srcDir, destArchive string) error {
	// we want the temporary file we use for output to be in the same directory as the real destination, so
	// we can be certain that our final os.Rename() call will not have to operate across a device boundary
	output, err := os.CreateTemp(filepath.Dir(destArchive), "tmp-archive")
	if err != nil {
		return fmt.Errorf("failed to create temporary file for archive: %w", err)
	}

	zipWriter := gzip.NewWriter(output)
	tarWriter := tar.NewWriter(zipWriter)
	defer func() {
		if err := tarWriter.Close(); err != nil {
			logrus.WithError(err).Warn("Could not close tar writer after archiving.")
		}
		if err := zipWriter.Close(); err != nil {
			logrus.WithError(err).Warn("Could not close zip writer after archiving.")
		}
		if err := output.Close(); err != nil {
			logrus.WithError(err).Warn("Could not close output file after archiving.")
		}
	}()

	if err := filepath.Walk(srcDir, func(absPath string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}

		// Handle symlinks. See https://stackoverflow.com/a/40003617.
		var link string
		if info.Mode()&os.ModeSymlink == os.ModeSymlink {
			if link, err = os.Readlink(absPath); err != nil {
				return err
			}
		}

		// "link" is only used by FileInfoHeader if "info" here is a symlink.
		// See https://pkg.go.dev/archive/tar#FileInfoHeader.
		header, err := tar.FileInfoHeader(info, link)
		if err != nil {
			return fmt.Errorf("could not create tar header: %w", err)
		}
		// FileInfoHeader only records the base name, so it won't get nested paths
		// right; overwrite the name with the path relative to the archive root.
		relpath, shouldNotErr := filepath.Rel(srcDir, absPath)
		if shouldNotErr != nil {
			logrus.WithError(shouldNotErr).Warnf("filepath.Rel returned an error, but we assumed there must be a relative path between %s and %s", srcDir, absPath)
		}
		header.Name = relpath
		if err := tarWriter.WriteHeader(header); err != nil {
			return fmt.Errorf("could not write tar header: %w", err)
		}
		if info.IsDir() {
			return nil
		}

		// Nothing more to do for non-regular files (symlinks).
		if !info.Mode().IsRegular() {
			return nil
		}

		file, err := os.Open(absPath)
		if err != nil {
			return fmt.Errorf("could not open source file: %w", err)
		}
		n, err := io.Copy(tarWriter, file)
		if err != nil {
			return fmt.Errorf("could not tar file: %w", err)
		}
		if n != info.Size() {
			return fmt.Errorf("only wrote %d bytes from %s; expected %d", n, absPath, info.Size())
		}
		if err := file.Close(); err != nil {
			return fmt.Errorf("could not close source file: %w", err)
		}
		return nil
	}); err != nil {
		return fmt.Errorf("could not walk source files to archive them: %w", err)
	}

	if err := os.Rename(output.Name(), destArchive); err != nil {
		return fmt.Errorf("could not overwrite archive: %w", err)
	}

	return nil
}

// handleFile censors the content of a file by streaming it to a new location, then overwriting the previous
// location, to make it seem like this happened in place on the filesystem
func handleFile(path string, censorer secretutil.Censorer, bufferSize int) error {
	input, err := os.Open(path)
	if err != nil {
		return fmt.Errorf("could not open file for censoring: %w", err)
	}

	// we want the temporary file we use for output to be in the same directory as the real destination, so
	// we can be certain that our final os.Rename() call will not have to operate across a device boundary
	output, err := os.CreateTemp(filepath.Dir(path), "tmp-censor")
	if err != nil {
		return fmt.Errorf("could not create temporary file for censoring: %w", err)
	}

	if err := censor(input, output, censorer, bufferSize); err != nil {
		return fmt.Errorf("could not censor file: %w", err)
	}

	if err := os.Rename(output.Name(), path); err != nil {
		return fmt.Errorf("could not overwrite file after censoring: %w", err)
	}

	return nil
}
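
// For illustration: because the temporary file lives next to its destination,
// the final os.Rename is a same-filesystem rename, which POSIX guarantees to
// be atomic. Concurrent readers therefore see either the old file or the fully
// censored one, never a half-written file. A hypothetical caller:
//
//	if err := handleFile("/logs/build-log.txt", censorer, defaultBufferSize); err != nil {
//		logrus.WithError(err).Error("censoring failed")
//	}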

// censor censors input data and streams it to the output. We have a memory footprint of bufferSize bytes.
func censor(input io.ReadCloser, output io.WriteCloser, censorer secretutil.Censorer, bufferSize int) error {
	if bufferSize%2 != 0 {
		return fmt.Errorf("buffer size must be even, not %d", bufferSize)
	}
	defer func() {
		if err := input.Close(); err != nil {
			logrus.WithError(err).Warn("Could not close input file after censoring.")
		}
		if err := output.Close(); err != nil {
			logrus.WithError(err).Warn("Could not close output file after censoring.")
		}
	}()

	buffer := make([]byte, bufferSize)
	frameSize := bufferSize / 2
	// bootstrap the algorithm by reading in the first half-frame
	numInitialized, initializeErr := input.Read(buffer[:frameSize])
	// handle read errors - if we read everything in this init step, the next read will return 0, EOF and
	// we can flush appropriately as part of the process loop
	if initializeErr != nil && initializeErr != io.EOF {
		return fmt.Errorf("could not read data from input file before censoring: %w", initializeErr)
	}
	frameSize = numInitialized // this will normally be bufferSize/2 but will be smaller at the end of the file
	for {
		// populate the second half of the buffer with new data
		numRead, readErr := input.Read(buffer[frameSize:])
		if readErr != nil && readErr != io.EOF {
			return fmt.Errorf("could not read data from input file before censoring: %w", readErr)
		}
		// censor the full buffer and flush the first half to the output
		censorer.Censor(&buffer)
		numWritten, writeErr := output.Write(buffer[:frameSize])
		if writeErr != nil {
			return fmt.Errorf("could not write data to output file after censoring: %w", writeErr)
		}
		if numWritten != frameSize {
			// TODO: we could retry here, I guess? When would a filesystem write less than expected and not error?
			return fmt.Errorf("only wrote %d out of %d bytes after censoring", numWritten, frameSize)
		}
		// shift the buffer over and get ready to repopulate the rest with new data
		copy(buffer[:numRead], buffer[frameSize:frameSize+numRead])
		frameSize = numRead
		if readErr == io.EOF {
			break
		}
	}
	return nil
}
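
// For illustration, a walk-through with hypothetical sizes: bufferSize = 8, so
// frames are 4 bytes, and Options.censor() has already grown the buffer to at
// least twice the largest secret. Censoring the 6-byte secret "ABCDEF":
//
//	bootstrap: read "pref"           -> buffer: pref....
//	pass 1:    read "ABCD" into back -> buffer: prefABCD; censor; flush "pref"
//	           shift "ABCD" to the front
//	pass 2:    read "EFgh" into back -> buffer: ABCDEFgh; censor; flush front
//
// The secret straddled the first flush boundary, but pass 2 holds it whole, so
// it is replaced before any of its bytes reach the output. In general, any
// secret no longer than half the buffer is fully contained in at least one
// pass, which is why the buffer must be even and at least 2x the largest secret.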

// loadSecrets loads all files under the paths into memory
func loadSecrets(paths, iniFilenames []string) ([][]byte, error) {
	var secrets [][]byte
	for _, path := range paths {
		if err := filepath.Walk(path, func(path string, info os.FileInfo, err error) error {
			if err != nil {
				return err
			}
			if strings.HasPrefix(info.Name(), "..") {
				// kubernetes volumes also include files we
				// should not be looking into for keys
				if info.IsDir() {
					return filepath.SkipDir
				}
				return nil
			}
			if info.IsDir() {
				return nil
			}
			raw, err := os.ReadFile(path)
			if err != nil {
				return err
			}
			secrets = append(secrets, raw)
			// In many cases, a secret file contains much more than just the sensitive data. For instance,
			// container registry credentials files are JSON formatted, so there are only a couple of fields
			// that are truly secret; the rest is formatting and whitespace. The implication here is that
			// a censoring approach that only looks at the full, uninterrupted secret value will not be able
			// to censor anything if that value is reformatted, truncated, etc. When the secrets we are asked
			// to censor are container registry credentials, we know the format of these files and can extract
			// the subsets of data that are sensitive, allowing us not only to censor the full file's contents
			// but also any individual fields that exist in the output, whether they're there due to a user
			// extracting the fields or output being truncated, etc.
			var parser = func(bytes []byte) ([]string, error) {
				return nil, nil
			}
			if info.Name() == ".dockercfg" {
				parser = loadDockercfgAuths
			}
			if info.Name() == ".dockerconfigjson" {
				parser = loadDockerconfigJsonAuths
			}
			for _, filename := range iniFilenames {
				if info.Name() == filename {
					parser = loadIniData
					break
				}
			}
			extra, parseErr := parser(raw)
			if parseErr != nil {
				return fmt.Errorf("could not read %s as a docker secret: %w", path, parseErr)
			}
			// It is important that these are added to the list of secrets *after* their parent data,
			// as we will censor in order and this will give a reasonable guarantee that the parent
			// data (a superset of any of these fields) will be censored in its entirety, first. It
			// remains possible that the sliding window used to censor pulls in only part of the
			// superset and some small part of it is censored first, making the larger superset no
			// longer match the file being censored.
			for _, item := range extra {
				secrets = append(secrets, []byte(item))
			}
			return nil
		}); err != nil {
			return nil, err
		}
	}
	return secrets, nil
}

// loadDockercfgAuths parses auth values from a kubernetes.io/dockercfg secret
func loadDockercfgAuths(content []byte) ([]string, error) {
	var data map[string]authEntry
	if err := json.Unmarshal(content, &data); err != nil {
		return nil, err
	}
	var entries []authEntry
	for _, entry := range data {
		entries = append(entries, entry)
	}
	return collectSecretsFrom(entries), nil
}

// loadDockerconfigJsonAuths parses auth values from a kubernetes.io/dockerconfigjson secret
func loadDockerconfigJsonAuths(content []byte) ([]string, error) {
	var data = struct {
		Auths map[string]authEntry `json:"auths"`
	}{}
	if err := json.Unmarshal(content, &data); err != nil {
		return nil, err
	}
	var entries []authEntry
	for _, entry := range data.Auths {
		entries = append(entries, entry)
	}
	return collectSecretsFrom(entries), nil
}

// authEntry holds credentials for authentication to registries
type authEntry struct {
	Password string `json:"password"`
	Auth     string `json:"auth"`
}

// collectSecretsFrom gathers the sensitive fields from parsed auth entries
func collectSecretsFrom(entries []authEntry) []string {
	var auths []string
	for _, entry := range entries {
		if entry.Auth != "" {
			auths = append(auths, entry.Auth)
		}
		if entry.Password != "" {
			auths = append(auths, entry.Password)
		}
	}
	return auths
}
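
// For illustration, the two credential formats differ only in nesting; the
// registry name and values here are hypothetical:
//
//	.dockercfg:        {"registry.example.com": {"auth": "dXNlcjpodW50ZXIy", "password": "hunter2"}}
//	.dockerconfigjson: {"auths": {"registry.example.com": {"auth": "dXNlcjpodW50ZXIy", "password": "hunter2"}}}
//
// Both parse into authEntry values, and only the "auth" and "password" fields
// are registered as additional secrets to censor.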

// handleSection recursively collects the values of all keys in a section and its child sections
func handleSection(section *ini.Section, extra []string) []string {
	for _, subsection := range section.ChildSections() {
		extra = handleSection(subsection, extra)
	}
	for _, key := range section.Keys() {
		extra = append(extra, key.Value())
	}
	return extra
}

// loadIniData parses key-value data from an INI file
func loadIniData(content []byte) ([]string, error) {
	cfg, err := ini.Load(content)
	if err != nil {
		return nil, err
	}

	var extra []string
	for _, section := range cfg.Sections() {
		extra = handleSection(section, extra)
	}
	return extra, nil
}
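
// For illustration, given a hypothetical INI file whose name was registered in
// CensoringOptions.IniFilenames, every value becomes a secret to censor:
//
//	[registry]
//	token = hunter2
//
//	[registry.mirror]
//	password = s3cr3t
//
// loadIniData returns the values ("hunter2", "s3cr3t"), walking nested sections
// recursively; since go-ini's Sections() also lists child sections directly,
// some values may be collected twice, which is harmless for censoring.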