kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/platform/kzip/kzip.go (about) 1 /* 2 * Copyright 2018 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 // Package kzip implements the kzip compilation storage file format. 18 // 19 // The package exports two types of interest: A kzip.Reader can be used to read 20 // the contents of an existing kzip archive, and a kzip.Writer can be used to 21 // construct a new kzip archive. 22 // 23 // Reading an Archive: 24 // 25 // r, err := kzip.NewReader(file, size) 26 // ... 27 // 28 // // Look up a compilation record by its digest. 29 // unit, err := r.Lookup(unitDigest) 30 // ... 31 // 32 // // Scan all the compilation records stored. 33 // err := r.Scan(func(unit *kzip.Unit) error { 34 // if hasInterestingProperty(unit) { 35 // doStuffWith(unit) 36 // } 37 // return nil 38 // }) 39 // 40 // // Open a reader for a stored file. 41 // rc, err := r.Open(fileDigest) 42 // ... 43 // defer rc.Close() 44 // 45 // // Read the complete contents of a stored file. 46 // bits, err := r.ReadAll(fileDigest) 47 // ... 48 // 49 // Writing an Archive: 50 // 51 // w, err := kzip.NewWriter(file) 52 // ... 53 // 54 // // Add a compilation record and (optional) index data. 55 // udigest, err := w.AddUnit(unit, nil) 56 // ... 57 // 58 // // Add file contents. 59 // fdigest, err := w.AddFile(file) 60 // ... 
package kzip // import "kythe.io/kythe/go/platform/kzip"

import (
	"archive/zip"
	"bytes"
	"context"
	"crypto/sha256"
	"encoding/hex"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"path"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"kythe.io/kythe/go/platform/kcd/kythe"
	"kythe.io/kythe/go/util/log"
	"kythe.io/kythe/go/util/ptypes"

	"bitbucket.org/creachadair/stringset"
	"github.com/golang/protobuf/proto"
	"golang.org/x/sync/errgroup"
	"google.golang.org/protobuf/encoding/protojson"

	apb "kythe.io/kythe/proto/analysis_go_proto"
	spb "kythe.io/kythe/proto/storage_go_proto"

	// These are common detail messages used by Kythe compilations, and
	// required for JSON (un)marshaling to work.
	_ "kythe.io/kythe/proto/buildinfo_go_proto"
	_ "kythe.io/kythe/proto/cxx_go_proto"
	_ "kythe.io/kythe/proto/filecontext_go_proto"
	_ "kythe.io/kythe/proto/go_go_proto"
	_ "kythe.io/kythe/proto/java_go_proto"
)

// Encoding describes how compilation units will be encoded when written to a kzip.
//
// The JSON and Proto values occupy distinct bits, so EncodingAll is simply
// their bitwise OR and a single encoding can be tested for with a mask.
type Encoding int

const (
	// EncodingJSON specifies to use JSON encoding
	EncodingJSON Encoding = 1
	// EncodingProto specifies to use Proto encoding
	EncodingProto Encoding = 2
	// EncodingAll specifies to encode using all known encodings
	EncodingAll Encoding = EncodingJSON | EncodingProto

	// prefixJSON and prefixProto name the directories, directly below the
	// archive root, that hold JSON-encoded and proto-encoded compilation
	// units respectively.
	prefixJSON  = "units"
	prefixProto = "pbunits"
)

// Compilation is a CompilationUnit with the contents for all of its required inputs.
type Compilation struct {
	Proto *apb.CompilationUnit `json:"compilation"` // the compilation unit itself
	Files []*apb.FileData      `json:"files"`       // contents of the unit's input files
}

var (
	// Use a constant file modification time in the kzip so file diffs only compare the contents,
	// not when the kzips were created.
	modifiedTime = time.Unix(0, 0)
)

// EncodingFor converts a string to an Encoding.
130 func EncodingFor(v string) (Encoding, error) { 131 v = strings.ToUpper(v) 132 switch { 133 case v == "ALL": 134 return EncodingAll, nil 135 case v == "JSON": 136 return EncodingJSON, nil 137 case v == "PROTO": 138 return EncodingProto, nil 139 default: 140 return EncodingProto, fmt.Errorf("unknown encoding %s", v) 141 } 142 } 143 144 // String stringifies an Encoding 145 func (e Encoding) String() string { 146 switch { 147 case e == EncodingAll: 148 return "All" 149 case e == EncodingJSON: 150 return "JSON" 151 case e == EncodingProto: 152 return "Proto" 153 default: 154 return "Encoding" + strconv.FormatInt(int64(e), 10) 155 } 156 } 157 158 // DefaultEncoding returns the default kzip encoding 159 func DefaultEncoding() Encoding { 160 if e := os.Getenv("KYTHE_KZIP_ENCODING"); e != "" { 161 enc, err := EncodingFor(e) 162 if err == nil { 163 return enc 164 } 165 log.Errorf("unknown kzip encoding: %s", e) 166 } 167 return EncodingProto 168 } 169 170 // A Reader permits reading and scanning compilation records and file contents 171 // stored in a .kzip archive. The Lookup and Scan methods are mutually safe for 172 // concurrent use by multiple goroutines. 173 type Reader struct { 174 zip *zip.Reader 175 176 // The archives written by this library always use "root/" for the root 177 // directory, but it's not required by the spec. Use whatever name the 178 // archive actually specifies in the leading directory. 179 root string 180 181 // The prefix used for the compilation unit directory; one of 182 // prefixJSON or prefixProto 183 unitsPrefix string 184 } 185 186 // NewReader constructs a new Reader that consumes zip data from r, whose total 187 // size in bytes is given. 188 func NewReader(r io.ReaderAt, size int64) (*Reader, error) { 189 archive, err := zip.NewReader(r, size) 190 if err != nil { 191 return nil, err 192 } 193 // Order the files in the archive by path, so we can binary search. 
194 sort.Slice(archive.File, func(i, j int) bool { 195 return archive.File[i].Name < archive.File[j].Name 196 }) 197 198 if len(archive.File) == 0 { 199 return nil, errors.New("archive is empty") 200 } else if fi := archive.File[0].FileInfo(); !fi.IsDir() { 201 return nil, fmt.Errorf("archive root directory missing: expected a directory but got %v - see https://kythe.io/docs/kythe-kzip.html#_directory_and_file_layout", archive.File[0].Name) 202 } 203 root := archive.File[0].Name 204 pref, err := unitPrefix(root, archive.File) 205 if err != nil { 206 return nil, err 207 } 208 return &Reader{ 209 zip: archive, 210 root: root, 211 unitsPrefix: pref, 212 }, nil 213 } 214 215 func unitPrefix(root string, fs []*zip.File) (string, error) { 216 jsonDir := root + prefixJSON + "/" 217 protoDir := root + prefixProto + "/" 218 j := sort.Search(len(fs), func(i int) bool { 219 return fs[i].Name > jsonDir 220 }) 221 hasJSON := j < len(fs) && strings.HasPrefix(fs[j].Name, jsonDir) 222 p := sort.Search(len(fs), func(i int) bool { 223 return fs[i].Name > protoDir 224 }) 225 hasProto := p < len(fs) && strings.HasPrefix(fs[p].Name, protoDir) 226 if hasJSON && hasProto { 227 // validate that they have identical units based on hash 228 for p < len(fs) && j < len(fs) { 229 ispb := strings.HasPrefix(fs[p].Name, protoDir) 230 isjson := strings.HasPrefix(fs[j].Name, jsonDir) 231 if ispb != isjson { 232 return "", fmt.Errorf("both proto and JSON units found but are not identical") 233 } 234 if !ispb { 235 break 236 } 237 pdigest := strings.Split(fs[p].Name, "/")[2] 238 jdigest := strings.Split(fs[j].Name, "/")[2] 239 if pdigest != jdigest { 240 return "", fmt.Errorf("both proto and JSON units found but are not identical") 241 } 242 p++ 243 j++ 244 } 245 } 246 if hasProto { 247 return prefixProto, nil 248 } 249 return prefixJSON, nil 250 } 251 252 // Encoding exposes the file encoding being used to read compilation units. 
253 func (r *Reader) Encoding() (Encoding, error) { 254 switch { 255 case r.unitsPrefix == prefixJSON: 256 return EncodingJSON, nil 257 case r.unitsPrefix == prefixProto: 258 return EncodingProto, nil 259 } 260 return EncodingAll, fmt.Errorf("unknown encoding prefix: %v", r.unitsPrefix) 261 } 262 263 func (r *Reader) unitPath(digest string) string { return path.Join(r.root, r.unitsPrefix, digest) } 264 func (r *Reader) filePath(digest string) string { return path.Join(r.root, "files", digest) } 265 266 // ErrDigestNotFound is returned when a requested compilation unit or file 267 // digest is not found. 268 var ErrDigestNotFound = errors.New("digest not found") 269 270 // ErrUnitExists is returned by AddUnit when adding the same compilation 271 // multiple times. 272 var ErrUnitExists = errors.New("unit already exists") 273 274 func (r *Reader) readUnit(digest string, f *zip.File) (*Unit, error) { 275 rc, err := f.Open() 276 if err != nil { 277 return nil, err 278 } 279 rec := make([]byte, f.UncompressedSize64) 280 _, err = io.ReadFull(rc, rec) 281 rc.Close() 282 if err != nil { 283 return nil, err 284 } 285 var msg apb.IndexedCompilation 286 if r.unitsPrefix == prefixProto { 287 if err := proto.Unmarshal(rec, &msg); err != nil { 288 return nil, fmt.Errorf("error unmarshaling for %s: %s", digest, err) 289 } 290 } else if err := protojson.Unmarshal(rec, &msg); err != nil { 291 return nil, err 292 } 293 return &Unit{ 294 Digest: digest, 295 Proto: msg.Unit, 296 Index: msg.Index, 297 }, nil 298 } 299 300 // firstIndex returns the first index in the archive's file list whose 301 // path starts with prefix, or -1 if no such index exists. 
302 func (r *Reader) firstIndex(prefix string) int { 303 fs := r.zip.File 304 n := sort.Search(len(fs), func(i int) bool { 305 return fs[i].Name >= prefix 306 }) 307 if n >= len(fs) { 308 return -1 309 } 310 if !strings.HasPrefix(fs[n].Name, prefix) { 311 return -1 312 } 313 return n 314 } 315 316 // Lookup returns the specified compilation from the archive, if it exists. If 317 // the requested digest is not in the archive, ErrDigestNotFound is returned. 318 func (r *Reader) Lookup(unitDigest string) (*Unit, error) { 319 needle := r.unitPath(unitDigest) 320 pos := r.firstIndex(needle) 321 if pos >= 0 { 322 if f := r.zip.File[pos]; f.Name == needle { 323 return r.readUnit(unitDigest, f) 324 } 325 } 326 return nil, ErrDigestNotFound 327 } 328 329 // A ScanOption configures the behavior of scanning a kzip file. 330 type ScanOption interface{ isScanOption() } 331 332 type readConcurrency int 333 334 func (readConcurrency) isScanOption() {} 335 336 // ReadConcurrency returns a ScanOption that configures the max concurrency of 337 // reading compilation units within a kzip archive. 338 func ReadConcurrency(n int) ScanOption { 339 return readConcurrency(n) 340 } 341 342 func (r *Reader) canonicalUnits() (string, []*zip.File) { 343 prefix := r.unitPath("") + "/" 344 pos := r.firstIndex(prefix) 345 if pos < 0 { 346 return "", nil 347 } 348 var res []*zip.File 349 for _, file := range r.zip.File[pos:] { 350 if !strings.HasPrefix(file.Name, prefix) { 351 break 352 } 353 if file.Name == prefix { 354 continue // tolerate an empty units directory entry 355 } 356 res = append(res, file) 357 358 } 359 return prefix, res 360 } 361 362 // Scan scans all the compilations stored in the archive, and invokes f for 363 // each compilation record. If f reports an error, the scan is terminated and 364 // that error is propagated to the caller of Scan. At most 1 invocation of f 365 // will occur at any one time. 
func (r *Reader) Scan(f func(*Unit) error, opts ...ScanOption) error {
	// One reader goroutine by default; a ReadConcurrency option may raise it.
	concurrency := 1
	for _, opt := range opts {
		switch opt := opt.(type) {
		case readConcurrency:
			// Non-positive values are ignored and leave the current setting.
			if n := int(opt); n > 0 {
				concurrency = n
			}
		default:
			return fmt.Errorf("unknown ScanOption type: %T", opt)
		}
	}

	prefix, fileUnits := r.canonicalUnits()
	if len(fileUnits) == 0 {
		return nil
	}

	// The deferred cancel releases every goroutine below on any return path,
	// including the early return taken when f fails.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	g, ctx := errgroup.WithContext(ctx)

	files := make(chan *zip.File)

	// Producer: feeds the unit entries to the readers until they are all
	// sent or the scan is cancelled.
	g.Go(func() error {
		defer close(files)
		for _, file := range fileUnits {
			select {
			case <-ctx.Done():
				return nil
			case files <- file:
			}
		}
		return nil
	})
	units := make(chan *Unit)
	var wg sync.WaitGroup
	// Readers: decode entries concurrently and hand the results to the
	// calling goroutine. A decode error is returned to the errgroup.
	for i := 0; i < concurrency; i++ {
		wg.Add(1)
		g.Go(func() error {
			defer wg.Done()
			for file := range files {
				// The digest is the entry name below the units directory.
				digest := strings.TrimPrefix(file.Name, prefix)
				unit, err := r.readUnit(digest, file)
				if err != nil {
					return err
				}
				select {
				case <-ctx.Done():
					return nil
				case units <- unit:
				}
			}
			return nil
		})
	}
	// Close units once every reader has exited so the loop below terminates.
	go func() { wg.Wait(); close(units) }()
	// f is only ever called from this loop, so its invocations never overlap.
	for unit := range units {
		select {
		case <-ctx.Done():
			return g.Wait()
		default:
			if err := f(unit); err != nil {
				return err
			}
		}
	}
	return g.Wait()
}

// Open opens a reader on the contents of the specified file digest. If the
// requested digest is not in the archive, ErrDigestNotFound is returned. The
// caller must close the reader when it is no longer needed.
439 func (r *Reader) Open(fileDigest string) (io.ReadCloser, error) { 440 needle := r.filePath(fileDigest) 441 if pos := r.firstIndex(needle); pos >= 0 { 442 if f := r.zip.File[pos]; f.Name == needle { 443 return f.Open() 444 } 445 } 446 return nil, ErrDigestNotFound 447 } 448 449 // ReadAll returns the complete contents of the file with the specified digest. 450 // It is a convenience wrapper for Open followed by ioutil.ReadAll. 451 func (r *Reader) ReadAll(fileDigest string) ([]byte, error) { 452 f, err := r.Open(fileDigest) 453 if err == nil { 454 defer f.Close() 455 return ioutil.ReadAll(f) 456 } 457 return nil, err 458 } 459 460 // A Unit represents a compilation record read from a kzip archive. 461 type Unit struct { 462 Digest string 463 Proto *apb.CompilationUnit 464 Index *apb.IndexedCompilation_Index 465 } 466 467 // A Writer permits construction of a .kzip archive. 468 type Writer struct { 469 mu sync.Mutex 470 zip *zip.Writer 471 fd stringset.Set // file digests already written 472 ud stringset.Set // unit digests already written 473 c io.Closer // a closer for the underlying writer (may be nil) 474 475 encoding Encoding // What encoding to use 476 } 477 478 // WriterOption describes options when creating a Writer 479 type WriterOption func(*Writer) 480 481 // WithEncoding sets the encoding to be used by a Writer 482 func WithEncoding(e Encoding) WriterOption { 483 return func(w *Writer) { 484 w.encoding = e 485 } 486 } 487 488 // NewWriter constructs a new empty Writer that delivers output to w. The 489 // AddUnit and AddFile methods are safe for use by concurrent goroutines. 490 func NewWriter(w io.Writer, options ...WriterOption) (*Writer, error) { 491 archive := zip.NewWriter(w) 492 // Create an entry for the root directory, which must be first. 
493 root := &zip.FileHeader{ 494 Name: "root/", 495 Comment: "kzip root directory", 496 Modified: modifiedTime, 497 } 498 root.SetMode(os.ModeDir | 0755) 499 if _, err := archive.CreateHeader(root); err != nil { 500 return nil, err 501 } 502 archive.SetComment("Kythe kzip archive") 503 504 kw := &Writer{ 505 zip: archive, 506 fd: stringset.New(), 507 ud: stringset.New(), 508 encoding: DefaultEncoding(), 509 } 510 for _, opt := range options { 511 opt(kw) 512 } 513 return kw, nil 514 } 515 516 // NewWriteCloser behaves as NewWriter, but arranges that when the *Writer is 517 // closed it also closes wc. 518 func NewWriteCloser(wc io.WriteCloser, options ...WriterOption) (*Writer, error) { 519 w, err := NewWriter(wc, options...) 520 if err == nil { 521 w.c = wc 522 } 523 return w, err 524 } 525 526 // toJSON defines the encoding format for compilation messages. 527 var toJSON = &protojson.MarshalOptions{UseProtoNames: true} 528 529 // AddUnit adds a new compilation record to be added to the archive, returning 530 // the hex-encoded SHA256 digest of the unit's contents. It is legal for index 531 // to be nil, in which case no index terms will be added. 532 // 533 // If the same compilation is added multiple times, AddUnit returns the digest 534 // of the duplicated compilation along with ErrUnitExists to all callers after 535 // the first. The existing unit is not modified. 
536 func (w *Writer) AddUnit(cu *apb.CompilationUnit, index *apb.IndexedCompilation_Index) (string, error) { 537 unit := kythe.Unit{Proto: cu} 538 unit.Canonicalize() 539 digest := unit.Digest() 540 541 w.mu.Lock() 542 defer w.mu.Unlock() 543 if w.ud.Contains(digest) { 544 return digest, ErrUnitExists 545 } 546 547 if w.encoding&EncodingJSON != 0 { 548 f, err := w.zip.CreateHeader(newFileHeader("root", prefixJSON, digest)) 549 if err != nil { 550 return "", err 551 } 552 rec, err := toJSON.Marshal(&apb.IndexedCompilation{ 553 Unit: unit.Proto, 554 Index: index, 555 }) 556 if err != nil { 557 return "", err 558 } 559 if _, err := f.Write(rec); err != nil { 560 return "", err 561 } 562 } 563 if w.encoding&EncodingProto != 0 { 564 f, err := w.zip.CreateHeader(newFileHeader("root", prefixProto, digest)) 565 if err != nil { 566 return "", err 567 } 568 rec, err := proto.Marshal(&apb.IndexedCompilation{ 569 Unit: unit.Proto, 570 Index: index, 571 }) 572 if err != nil { 573 return "", err 574 } 575 _, err = f.Write(rec) 576 if err != nil { 577 return "", err 578 } 579 } 580 w.ud.Add(digest) 581 return digest, nil 582 } 583 584 // AddFile copies the complete contents of r into the archive as a new file 585 // entry, returning the hex-encoded SHA256 digest of the file's contents. 586 func (w *Writer) AddFile(r io.Reader) (string, error) { 587 // Buffer the file contents and compute their digest. 588 // We have to do this ahead of time, because we have to provide the name of 589 // the file before we can start writing its contents. 
590 var buf bytes.Buffer 591 hash := sha256.New() 592 if _, err := io.Copy(io.MultiWriter(hash, &buf), r); err != nil { 593 return "", err 594 } 595 digest := hex.EncodeToString(hash.Sum(nil)) 596 597 w.mu.Lock() 598 defer w.mu.Unlock() 599 if w.fd.Contains(digest) { 600 return digest, nil // already written 601 } 602 603 f, err := w.zip.CreateHeader(newFileHeader("root", "files", digest)) 604 if err != nil { 605 return "", err 606 } 607 if _, err := io.Copy(f, &buf); err != nil { 608 return "", err 609 } 610 w.fd.Add(digest) 611 return digest, nil 612 } 613 614 // Close closes the writer, flushing any remaining unwritten data out to the 615 // underlying zip file. It is safe to close w arbitrarily many times; all calls 616 // after the first will report nil. 617 func (w *Writer) Close() error { 618 w.mu.Lock() 619 defer w.mu.Unlock() 620 if w.zip != nil { 621 err := w.zip.Close() 622 w.zip = nil 623 if w.c != nil { 624 if cerr := w.c.Close(); err == nil { 625 return cerr 626 } 627 } 628 return err 629 } 630 return nil 631 } 632 633 func newFileHeader(parts ...string) *zip.FileHeader { 634 fh := &zip.FileHeader{Name: path.Join(parts...), Method: zip.Deflate} 635 fh.SetMode(0600) 636 fh.Modified = modifiedTime 637 return fh 638 } 639 640 // Scan is a convenience function that creates a *Reader from f and invokes its 641 // Scan method with the given callback. Each invocation of scan is passed the 642 // reader associated with f, along with the current compilation unit. 643 func Scan(f File, scan func(*Reader, *Unit) error, opts ...ScanOption) error { 644 size, err := f.Seek(0, io.SeekEnd) 645 if err != nil { 646 return fmt.Errorf("getting file size: %v", err) 647 } 648 r, err := NewReader(f, size) 649 if err != nil { 650 return err 651 } 652 return r.Scan(func(unit *Unit) error { 653 return scan(r, unit) 654 }, opts...) 655 } 656 657 // A File represents the file capabilities needed to scan a kzip file. 
658 type File interface { 659 io.ReaderAt 660 io.Seeker 661 } 662 663 // FileData creates a file data protobuf message by fully reading the contents 664 // of r, having the designated path. 665 func FileData(path string, r io.Reader) (*apb.FileData, error) { 666 var buf bytes.Buffer 667 hash := sha256.New() 668 669 w := io.MultiWriter(&buf, hash) 670 if _, err := io.Copy(w, r); err != nil { 671 return nil, err 672 } 673 digest := hex.EncodeToString(hash.Sum(nil)) 674 return &apb.FileData{ 675 Content: buf.Bytes(), 676 Info: &apb.FileInfo{ 677 Path: path, 678 Digest: digest, 679 }, 680 }, nil 681 } 682 683 // Fetch implements the analysis.Fetcher interface for files attached to c. 684 // If digest == "", files are matched by path only. 685 func (c *Compilation) Fetch(path, digest string) ([]byte, error) { 686 for _, f := range c.Files { 687 info := f.GetInfo() 688 fp := info.Path 689 fd := info.Digest 690 if path == fp && (digest == "" || digest == fd) { 691 return f.Content, nil 692 } 693 if digest != "" && digest == fd { 694 return f.Content, nil 695 } 696 } 697 return nil, os.ErrNotExist 698 } 699 700 // Unit returns the CompilationUnit associated with c, creating a new empty one 701 // if necessary. 702 func (c *Compilation) Unit() *apb.CompilationUnit { 703 if c.Proto == nil { 704 c.Proto = new(apb.CompilationUnit) 705 } 706 return c.Proto 707 } 708 709 // AddFile adds an input file to the compilation by fully reading r. The file 710 // is added to the required inputs, attributed to the designated path, and also 711 // to the file data slice. If v != nil it is used as the vname of the input 712 // added. 
713 func (c *Compilation) AddFile(path string, r io.Reader, v *spb.VName, details ...proto.Message) error { 714 var anys []*ptypes.Any 715 for _, d := range details { 716 any, err := ptypes.MarshalAny(d) 717 if err != nil { 718 return fmt.Errorf("unable to marshal %T to Any: %v", d, err) 719 } 720 anys = append(anys, any) 721 } 722 fd, err := FileData(path, r) 723 if err != nil { 724 return err 725 } 726 c.Files = append(c.Files, fd) 727 unit := c.Unit() 728 unit.RequiredInput = append(unit.RequiredInput, &apb.CompilationUnit_FileInput{ 729 VName: v, 730 Info: fd.Info, 731 Details: anys, 732 }) 733 return nil 734 } 735 736 // AddDetails adds the specified details message to the compilation. 737 func (c *Compilation) AddDetails(msg proto.Message) error { 738 details, err := ptypes.MarshalAny(msg) 739 if err != nil { 740 return err 741 } 742 unit := c.Unit() 743 unit.Details = append(unit.Details, details) 744 return nil 745 }