go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cipd/appengine/impl/repo/processing/extractor.go

// Copyright 2021 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package processing

import (
	"context"
	"hash"
	"io"
	"net/http"

	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/server/auth"

	"go.chromium.org/luci/cipd/appengine/impl/cas"
	"go.chromium.org/luci/cipd/appengine/impl/gs"
	"go.chromium.org/luci/cipd/common"

	api "go.chromium.org/luci/cipd/api/cipd/v1"
)

// Extractor can extract files from the package, writing them to the CAS.
type Extractor struct {
	// Reader is an already open package file.
	Reader *PackageReader

	// CAS is the destination CAS implementation.
	CAS cas.StorageServer

	// PrimaryHash is the hash algorithm to use to name the file in the CAS.
	PrimaryHash api.HashAlgo

	// AlternativeHashes is a list of hashes to calculate in addition to
	// the PrimaryHash.
	AlternativeHashes []api.HashAlgo

	// Uploader returns an io.Writer that uploads to the given destination URL.
	//
	// If nil, a Google Storage uploader will be used. Overriding it is useful
	// in tests.
	Uploader func(ctx context.Context, size int64, uploadURL string) io.Writer

	// BufferSize is the size of the buffer for GS uploads (the default is 2 MB).
	BufferSize int
}
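
// A rough usage sketch (pkgReader and casServer below are hypothetical
// placeholders for a *PackageReader and a cas.StorageServer, and the hash
// algo is just an example):
//
//	ex := &Extractor{
//		Reader:      pkgReader,
//		CAS:         casServer,
//		PrimaryHash: api.HashAlgo_SHA256,
//	}
//	res, err := ex.Run(ctx, "bin/tool")
//	if err != nil {
//		return err
//	}
//	// res.Ref now points to the extracted file in the CAS.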

// ExtractionResult is a result of a successful file extraction.
type ExtractionResult struct {
	Path   string                     // the file path passed to Run
	Ref    *api.ObjectRef             // reference to the extracted file in the CAS
	Size   int64                      // the size of the file in bytes
	Hashes map[api.HashAlgo]hash.Hash // all calculated hashes
}
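
// The hex digest of any of the calculated hashes can be recovered from Hashes
// via common.HexDigest. For example (with ex and res as in the sketch above):
//
//	digest := common.HexDigest(res.Hashes[ex.PrimaryHash])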

// Run extracts a single file from the package.
func (ex *Extractor) Run(ctx context.Context, path string) (*ExtractionResult, error) {
	// Collect a map with all output hashes.
	hashes := make(map[api.HashAlgo]hash.Hash, len(ex.AlternativeHashes)+1)
	for _, algo := range ex.AlternativeHashes {
		hashes[algo] = common.MustNewHash(algo)
	}
	if hashes[ex.PrimaryHash] == nil {
		hashes[ex.PrimaryHash] = common.MustNewHash(ex.PrimaryHash)
	}

	// Start reading the file.
	reader, size, err := ex.Reader.Open(path)
	if err != nil {
		return nil, errors.Annotate(err, "failed to open the file for reading").Err()
	}
	defer reader.Close() // we don't care about errors here

	// Start writing the result to CAS.
	op, err := ex.CAS.BeginUpload(ctx, &api.BeginUploadRequest{
		HashAlgo: ex.PrimaryHash,
	})
	if err != nil {
		return nil, errors.Annotate(err, "failed to open a CAS upload").Tag(transient.Tag).Err()
	}

	// Grab an io.Writer that uploads to Google Storage.
	factory := ex.Uploader
	if factory == nil {
		factory = gsUploader
	}
	uploader := factory(ctx, size, op.UploadUrl)

	// Copy in 2 MB chunks by default.
	bufferSize := ex.BufferSize
	if bufferSize == 0 {
		bufferSize = 2 * 1024 * 1024
	}

	// Copy, calculating digests on the fly.
	//
	// We use fullReader to make sure we write full 2 MB chunks to GS. Otherwise
	// 'reader' uses 32 KB buffers and they are flushed as 32 KB buffers to Google
	// Storage too (which doesn't work). Remember, in Go an io.Reader can choose
	// to read less than asked, and zip readers use 32 KB buffers. io.CopyBuffer
	// just sends them to the writer right away.
	//
	// Note that reads from Google Storage are already properly buffered by the
	// PackageReader implementation, so it's OK if the zip reader reads small
	// chunks from the underlying file reader. We basically read a 512 KB buffer
	// from GS, unzip it in memory via small 32 KB chunks into a 2 MB output
	// buffer, and then flush it to GS.
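	// Fan the data out to the GS uploader and to every hash calculator at once.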
	writeTo := make([]io.Writer, 0, 1+len(hashes))
	writeTo = append(writeTo, uploader)
	for _, hash := range hashes {
		writeTo = append(writeTo, hash)
	}
	copied, err := io.CopyBuffer(
		io.MultiWriter(writeTo...),
		fullReader{reader},
		make([]byte, bufferSize))
	if err == nil && copied != size {
		err = errors.Reason("unexpected file size: expecting %d bytes, read %d bytes", size, copied).Err()
	}

	// If asked to rewind to a faraway offset (should be rare), just restart the
	// whole process from scratch by returning a transient error.
	if _, ok := err.(*gs.RestartUploadError); ok {
		err = errors.Annotate(err, "asked to restart the upload from faraway offset").Tag(transient.Tag).Err()
	}

	if err != nil {
		// Best-effort cleanup of the upload session. It's not a big deal if this
		// fails and the upload stays as garbage.
		_, cancelErr := ex.CAS.CancelUpload(ctx, &api.CancelUploadRequest{
			UploadOperationId: op.OperationId,
		})
		if cancelErr != nil {
			logging.Errorf(ctx, "Failed to cancel the upload: %s", cancelErr)
		}
		return nil, err
	}

	// Skip the hash calculation in CAS by enforcing the hash we've just
	// calculated.
	extractedRef := &api.ObjectRef{
		HashAlgo:  ex.PrimaryHash,
		HexDigest: common.HexDigest(hashes[ex.PrimaryHash]),
	}
	op, err = ex.CAS.FinishUpload(ctx, &api.FinishUploadRequest{
		UploadOperationId: op.OperationId,
		ForceHash:         extractedRef,
	})

	// CAS should publish the object right away.
	switch {
	case err != nil:
		return nil, errors.Annotate(err, "failed to finalize the CAS upload").Tag(transient.Tag).Err()
	case op.Status != api.UploadStatus_PUBLISHED:
		return nil, errors.Reason("unexpected upload status from CAS %s: %s", op.Status, op.ErrorMessage).Err()
	}

	// Success!
	return &ExtractionResult{
		Path:   path,
		Ref:    extractedRef,
		Size:   size,
		Hashes: hashes,
	}, nil
}

////////////////////////////////////////////////////////////////////////////////

func gsUploader(ctx context.Context, size int64, uploadURL string) io.Writer {
	// Authentication is handled through the tokens in the upload session URL.
	tr, err := auth.GetRPCTransport(ctx, auth.NoAuth)
	if err != nil {
		panic(errors.Annotate(err, "failed to get the RPC transport").Err())
	}
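	// The returned gs.Uploader implements io.Writer, sending everything
	// written to it to the upload session at uploadURL.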
	return &gs.Uploader{
		Context:   ctx,
		Client:    &http.Client{Transport: tr},
		UploadURL: uploadURL,
		FileSize:  size,
	}
}

// fullReader is an io.Reader that fills the buffer completely using the data
// from the underlying reader.
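//
// For example (illustrative numbers): a zip entry reader may return only
// ~32 KB per Read call; wrapping it in fullReader makes each Read fill the
// whole 2 MB copy buffer (except possibly the last chunk), so io.CopyBuffer
// above hands full-sized chunks to the GS uploader.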
type fullReader struct {
	r io.ReadCloser
}

func (r fullReader) Read(buf []byte) (n int, err error) {
	n, err = io.ReadFull(r.r, buf)
	if err == io.ErrUnexpectedEOF {
		err = nil // this is fine, we are just reading the last chunk
	}
	return
}