go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cipd/appengine/impl/repo/processing/extractor.go

// Copyright 2021 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package processing

import (
	"context"
	"hash"
	"io"
	"net/http"

	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/server/auth"

	"go.chromium.org/luci/cipd/appengine/impl/cas"
	"go.chromium.org/luci/cipd/appengine/impl/gs"
	"go.chromium.org/luci/cipd/common"

	api "go.chromium.org/luci/cipd/api/cipd/v1"
)

// Extractor can extract files from the package, writing them to the CAS.
type Extractor struct {
	// Reader is an already open package file.
	Reader *PackageReader

	// CAS is the destination CAS implementation.
	CAS cas.StorageServer

	// PrimaryHash is the hash algorithm to use to name the file in the CAS.
	PrimaryHash api.HashAlgo

	// AlternativeHashes is a list of hashes to calculate in addition to
	// the PrimaryHash.
	AlternativeHashes []api.HashAlgo

	// Uploader returns an io.Writer that uploads to the given destination URL.
	//
	// If nil, a Google Storage uploader is used. Non-nil values are useful
	// in tests.
	Uploader func(ctx context.Context, size int64, uploadURL string) io.Writer

	// BufferSize is the size of the buffer for GS uploads (default is 2 MB).
	BufferSize int
}

// ExtractionResult is the result of a successful file extraction.
type ExtractionResult struct {
	Path   string                     // the file path passed to Run
	Ref    *api.ObjectRef             // reference to the extracted file in the CAS
	Size   int64                      // the size of the file in bytes
	Hashes map[api.HashAlgo]hash.Hash // all calculated hashes
}

// Run extracts a single file from the package.
func (ex *Extractor) Run(ctx context.Context, path string) (*ExtractionResult, error) {
	// Collect a map with all output hashes.
	hashes := make(map[api.HashAlgo]hash.Hash, len(ex.AlternativeHashes)+1)
	for _, algo := range ex.AlternativeHashes {
		hashes[algo] = common.MustNewHash(algo)
	}
	if hashes[ex.PrimaryHash] == nil {
		hashes[ex.PrimaryHash] = common.MustNewHash(ex.PrimaryHash)
	}

	// Start reading the file.
	reader, size, err := ex.Reader.Open(path)
	if err != nil {
		return nil, errors.Annotate(err, "failed to open the file for reading").Err()
	}
	defer reader.Close() // we don't care about errors here

	// Start writing the result to CAS.
	op, err := ex.CAS.BeginUpload(ctx, &api.BeginUploadRequest{
		HashAlgo: ex.PrimaryHash,
	})
	if err != nil {
		return nil, errors.Annotate(err, "failed to open a CAS upload").Tag(transient.Tag).Err()
	}

	// Grab an io.Writer that uploads to Google Storage.
	factory := ex.Uploader
	if factory == nil {
		factory = gsUploader
	}
	uploader := factory(ctx, size, op.UploadUrl)

	// Copy in 2 MB chunks by default.
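	// (Assumption about the GS API, not stated in this file: Google Storage
	// resumable uploads require each non-final chunk to be a multiple of
	// 256 KiB, which the 2 MB default satisfies. fullReader below is what
	// guarantees the chunks actually arrive full-sized.)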
	bufferSize := ex.BufferSize
	if bufferSize == 0 {
		bufferSize = 2 * 1024 * 1024
	}

	// Copy, calculating digests on the fly.
	//
	// We use fullReader to make sure we write full 2 MB chunks to GS.
	// Otherwise 'reader' uses 32 KB buffers and they are flushed as 32 KB
	// buffers to Google Storage too (which doesn't work). Remember, in Go an
	// io.Reader is allowed to read less than asked, and zip readers use 32 KB
	// buffers. CopyBuffer just sends them to the writer right away.
	//
	// Note that reads from Google Storage are already properly buffered by the
	// PackageReader implementation, so it's OK if the zip reader reads small
	// chunks from the underlying file reader. We basically read a 512 KB
	// buffer from GS, unzip it in memory via small 32 KB chunks into a 2 MB
	// output buffer, and then flush it to GS.
	writeTo := make([]io.Writer, 0, 1+len(hashes))
	writeTo = append(writeTo, uploader)
	for _, hash := range hashes {
		writeTo = append(writeTo, hash)
	}
	copied, err := io.CopyBuffer(
		io.MultiWriter(writeTo...),
		fullReader{reader},
		make([]byte, bufferSize))
	if err == nil && copied != size {
		err = errors.Reason("unexpected file size: expecting %d bytes, read %d bytes", size, copied).Err()
	}

	// If asked to rewind to a faraway offset (should be rare), just restart
	// the whole process from scratch by returning a transient error.
	if _, ok := err.(*gs.RestartUploadError); ok {
		err = errors.Annotate(err, "asked to restart the upload from a faraway offset").Tag(transient.Tag).Err()
	}

	if err != nil {
		// Best-effort cleanup of the upload session. It's not a big deal if
		// this fails and the upload stays as garbage.
		_, cancelErr := ex.CAS.CancelUpload(ctx, &api.CancelUploadRequest{
			UploadOperationId: op.OperationId,
		})
		if cancelErr != nil {
			logging.Errorf(ctx, "Failed to cancel the upload: %s", cancelErr)
		}
		return nil, err
	}

	// Skip the hash calculation in CAS by enforcing the hash: we've just
	// calculated it ourselves.
	extractedRef := &api.ObjectRef{
		HashAlgo:  ex.PrimaryHash,
		HexDigest: common.HexDigest(hashes[ex.PrimaryHash]),
	}
	op, err = ex.CAS.FinishUpload(ctx, &api.FinishUploadRequest{
		UploadOperationId: op.OperationId,
		ForceHash:         extractedRef,
	})

	// CAS should publish the object right away.
	switch {
	case err != nil:
		return nil, errors.Annotate(err, "failed to finalize the CAS upload").Tag(transient.Tag).Err()
	case op.Status != api.UploadStatus_PUBLISHED:
		return nil, errors.Reason("unexpected upload status from CAS %s: %s", op.Status, op.ErrorMessage).Err()
	}

	// Success!
	return &ExtractionResult{
		Path:   path,
		Ref:    extractedRef,
		Size:   size,
		Hashes: hashes,
	}, nil
}

////////////////////////////////////////////////////////////////////////////////

func gsUploader(ctx context.Context, size int64, uploadURL string) io.Writer {
	// Authentication is handled through the tokens in the upload session URL.
	tr, err := auth.GetRPCTransport(ctx, auth.NoAuth)
	if err != nil {
		panic(errors.Annotate(err, "failed to get the RPC transport").Err())
	}
	return &gs.Uploader{
		Context:   ctx,
		Client:    &http.Client{Transport: tr},
		UploadURL: uploadURL,
		FileSize:  size,
	}
}

// fullReader is an io.Reader that fills the given buffer completely using the
// data from the underlying reader.
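//
// For example, a zip entry reader typically returns at most 32 KB per Read
// call; fullReader keeps reading until the whole destination buffer (2 MB by
// default) is filled or the stream ends, so io.CopyBuffer above flushes only
// full-sized chunks to the uploader.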
type fullReader struct {
	r io.ReadCloser
}

func (r fullReader) Read(buf []byte) (n int, err error) {
	n, err = io.ReadFull(r.r, buf)
	if err == io.ErrUnexpectedEOF {
		err = nil // this is fine, we are just reading the last chunk
	}
	return
}
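// A minimal usage sketch for Extractor, assuming the caller already has a
// *PackageReader for the fetched package and a concrete cas.StorageServer
// (the variable names pkgReader and casServer below are hypothetical):
//
//	ex := &Extractor{
//		Reader:      pkgReader,
//		CAS:         casServer,
//		PrimaryHash: api.HashAlgo_SHA256,
//	}
//	res, err := ex.Run(ctx, "path/inside/package")
//	if err != nil {
//		return err // errors tagged with transient.Tag may be retried
//	}
//	logging.Infof(ctx, "extracted %q (%d bytes) as %s", res.Path, res.Size, res.Ref.HexDigest)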