github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/api/helpers/upload.go (about) 1 // Package helpers provide useful wrappers for clients using the lakeFS OpenAPI. 2 package helpers 3 4 import ( 5 "context" 6 "encoding/base64" 7 "encoding/hex" 8 "errors" 9 "fmt" 10 "io" 11 "mime" 12 "mime/multipart" 13 "net/http" 14 "net/textproto" 15 "net/url" 16 "path/filepath" 17 "strings" 18 19 "github.com/go-openapi/swag" 20 "github.com/treeverse/lakefs/pkg/api/apigen" 21 "github.com/treeverse/lakefs/pkg/api/apiutil" 22 "github.com/treeverse/lakefs/pkg/block/azure" 23 "github.com/treeverse/lakefs/pkg/httputil" 24 "golang.org/x/sync/errgroup" 25 ) 26 27 // MaxUploadParts is the maximum allowed number of parts in a multipart upload 28 const MaxUploadParts int32 = 10000 29 30 // MinUploadPartSize is the minimum allowed part size when uploading a part 31 const MinUploadPartSize int64 = 1024 * 1024 * 5 32 33 // DefaultUploadPartSize is the default part size to buffer chunks of a payload into 34 const DefaultUploadPartSize = MinUploadPartSize 35 36 // DefaultUploadConcurrency is the default number of goroutines to spin up when uploading a multipart upload 37 const DefaultUploadConcurrency = 5 38 39 // ClientUpload uploads contents as a file via lakeFS 40 func ClientUpload(ctx context.Context, client apigen.ClientWithResponsesInterface, repoID, branchID, objPath string, metadata map[string]string, contentType string, contents io.ReadSeeker) (*apigen.ObjectStats, error) { 41 pr, pw := io.Pipe() 42 defer func() { 43 _ = pr.Close() 44 }() 45 46 mpw := multipart.NewWriter(pw) 47 mpContentType := mpw.FormDataContentType() 48 go func() { 49 defer func() { 50 _ = mpw.Close() 51 _ = pw.Close() 52 }() 53 54 filename := filepath.Base(objPath) 55 const fieldName = "content" 56 var err error 57 var cw io.Writer 58 // when no content-type is specified, we let 'CreateFromFile' add the part with the default content type. 59 // otherwise, we add a part and set the content-type. 60 if contentType != "" { 61 h := make(textproto.MIMEHeader) 62 contentDisposition := mime.FormatMediaType("form-data", map[string]string{"name": fieldName, "filename": filename}) 63 h.Set("Content-Disposition", contentDisposition) 64 h.Set("Content-Type", contentType) 65 cw, err = mpw.CreatePart(h) 66 } else { 67 cw, err = mpw.CreateFormFile(fieldName, filename) 68 } 69 if err != nil { 70 _ = pw.CloseWithError(err) 71 return 72 } 73 if _, err := io.Copy(cw, contents); err != nil { 74 _ = pw.CloseWithError(err) 75 return 76 } 77 }() 78 79 resp, err := client.UploadObjectWithBodyWithResponse(ctx, repoID, branchID, &apigen.UploadObjectParams{ 80 Path: objPath, 81 }, mpContentType, pr, func(ctx context.Context, req *http.Request) error { 82 var metaKey string 83 for k, v := range metadata { 84 lowerKey := strings.ToLower(k) 85 if strings.HasPrefix(lowerKey, apiutil.LakeFSMetadataPrefix) { 86 metaKey = apiutil.LakeFSHeaderInternalPrefix + lowerKey[len(apiutil.LakeFSMetadataPrefix):] 87 } else { 88 metaKey = apiutil.LakeFSHeaderMetadataPrefix + lowerKey 89 } 90 req.Header.Set(metaKey, v) 91 } 92 return nil 93 }) 94 if err != nil { 95 return nil, err 96 } 97 if resp.JSON201 == nil { 98 return nil, ResponseAsError(resp) 99 } 100 return resp.JSON201, nil 101 } 102 103 // PreSignUploader uploads contents as a file via lakeFS using presigned urls 104 // It supports both multipart and single part uploads. 105 type PreSignUploader struct { 106 Concurrency int 107 HTTPClient *http.Client 108 Client apigen.ClientWithResponsesInterface 109 MultipartSupport bool 110 } 111 112 // presignUpload represents a single upload request 113 type presignUpload struct { 114 uploader PreSignUploader 115 repoID string 116 branchID string 117 objectPath string 118 metadata map[string]string 119 contentType string 120 reader io.ReadSeeker 121 readerAt io.ReaderAt 122 size int64 123 partSize int64 124 numParts int 125 } 126 127 func NewPreSignUploader(client apigen.ClientWithResponsesInterface, multipartSupport bool) *PreSignUploader { 128 return &PreSignUploader{ 129 Concurrency: DefaultUploadConcurrency, 130 HTTPClient: http.DefaultClient, 131 Client: client, 132 MultipartSupport: multipartSupport, 133 } 134 } 135 136 func (u *PreSignUploader) Upload(ctx context.Context, repoID string, branchID string, objPath string, content io.ReadSeeker, 137 contentType string, metadata map[string]string, 138 ) (*apigen.ObjectStats, error) { 139 // calculate size and rewind content 140 size, err := content.Seek(0, io.SeekEnd) 141 if err != nil { 142 return nil, err 143 } 144 if _, seekErr := content.Seek(0, io.SeekStart); seekErr != nil { 145 return nil, seekErr 146 } 147 // check if content implements io.ReaderAt for multipart upload 148 readerAt, _ := content.(io.ReaderAt) 149 150 // create upload object - represents a single upload request 151 upload := &presignUpload{ 152 uploader: *u, 153 repoID: repoID, 154 branchID: branchID, 155 objectPath: objPath, 156 metadata: metadata, 157 contentType: contentType, 158 reader: content, 159 readerAt: readerAt, 160 size: size, 161 } 162 return upload.Upload(ctx) 163 } 164 165 type presignPartReader struct { 166 Reader *io.SectionReader 167 URL string 168 } 169 170 func (u *presignUpload) uploadMultipart(ctx context.Context) (*apigen.ObjectStats, error) { 171 mpu, err := u.initMultipart(ctx) 172 if err != nil { 173 return nil, err 174 } 175 176 // prepare readers 177 parts := make([]presignPartReader, u.numParts) 178 rem := u.size 179 off := int64(0) 180 for i := range parts { 181 size := min(u.partSize, rem) 182 parts[i].Reader = io.NewSectionReader(u.readerAt, off, size) // use `readerAt` 183 parts[i].URL = (*mpu.PresignedUrls)[i] 184 rem -= size 185 off += size 186 } 187 188 // upload parts in parallel, fill uploadParts array with results 189 uploadParts := make([]apigen.UploadPart, u.numParts) 190 g, grpCtx := errgroup.WithContext(context.Background()) 191 g.SetLimit(u.uploader.Concurrency) 192 193 for i := 0; i < u.numParts; i++ { 194 i := i // pinning 195 g.Go(func() error { 196 etag, err := u.uploadPart(grpCtx, parts[i].Reader, parts[i].URL) 197 if err != nil { 198 return fmt.Errorf("part %d %w", i+1, err) 199 } 200 uploadParts[i] = apigen.UploadPart{ 201 PartNumber: i + 1, 202 Etag: etag, 203 } 204 return nil 205 }) 206 } 207 208 // wait for all parts to be uploaded 209 if err := g.Wait(); err != nil { 210 // abort upload using a new context to avoid context cancellation 211 abortErr := u.abortMultipart(context.Background(), mpu) 212 if abortErr != nil { 213 err = errors.Join(err, abortErr) 214 } 215 return nil, err 216 } 217 218 return u.completeMultipart(ctx, mpu, uploadParts) 219 } 220 221 func (u *presignUpload) completeMultipart(ctx context.Context, mpu *apigen.PresignMultipartUpload, uploadParts []apigen.UploadPart) (*apigen.ObjectStats, error) { 222 resp, err := u.uploader.Client.CompletePresignMultipartUploadWithResponse(ctx, u.repoID, u.branchID, mpu.UploadId, 223 &apigen.CompletePresignMultipartUploadParams{ 224 Path: u.objectPath, 225 }, 226 apigen.CompletePresignMultipartUploadJSONRequestBody{ 227 Parts: uploadParts, 228 PhysicalAddress: mpu.PhysicalAddress, 229 UserMetadata: &apigen.CompletePresignMultipartUpload_UserMetadata{ 230 AdditionalProperties: u.metadata, 231 }, 232 ContentType: apiutil.Ptr(u.contentType), 233 }, 234 ) 235 if err != nil { 236 return nil, fmt.Errorf("complete presign multipart upload: %w", err) 237 } 238 if resp.JSON409 != nil { 239 return nil, ErrConflict 240 } 241 if resp.JSON200 == nil { 242 return nil, fmt.Errorf("complete presign multipart upload: %w", ResponseAsError(resp)) 243 } 244 return resp.JSON200, nil 245 } 246 247 func (u *presignUpload) abortMultipart(ctx context.Context, mpu *apigen.PresignMultipartUpload) error { 248 resp, err := u.uploader.Client.AbortPresignMultipartUploadWithResponse(ctx, u.repoID, u.branchID, mpu.UploadId, 249 &apigen.AbortPresignMultipartUploadParams{ 250 Path: u.objectPath, 251 }, 252 apigen.AbortPresignMultipartUploadJSONRequestBody{ 253 PhysicalAddress: mpu.PhysicalAddress, 254 }) 255 if err != nil { 256 return fmt.Errorf("abort presign multipart upload: %w", err) 257 } 258 if resp.StatusCode() != http.StatusNoContent { 259 return fmt.Errorf("abort presign multipart upload: %w", ResponseAsError(resp)) 260 } 261 return nil 262 } 263 264 func (u *presignUpload) initMultipart(ctx context.Context) (*apigen.PresignMultipartUpload, error) { 265 // adjust part size 266 u.partSize = DefaultUploadPartSize 267 if u.size/u.partSize >= int64(MaxUploadParts) { 268 // Add one to the part size to account for remainders 269 u.partSize = (u.size / int64(MaxUploadParts)) + 1 270 } 271 272 // calculate the number of parts 273 u.numParts = int(u.size / u.partSize) 274 if u.size%u.partSize != 0 { 275 u.numParts++ 276 } 277 278 // create presign multipart upload 279 resp, err := u.uploader.Client.CreatePresignMultipartUploadWithResponse(ctx, u.repoID, u.branchID, &apigen.CreatePresignMultipartUploadParams{ 280 Path: u.objectPath, 281 Parts: swag.Int(u.numParts), 282 }) 283 if err != nil { 284 return nil, fmt.Errorf("create presign multipart upload: %w", err) 285 } 286 if resp.JSON201 == nil { 287 return nil, fmt.Errorf("create presign multipart upload: %w", ResponseAsError(resp)) 288 } 289 290 // verify we got the expected number of presigned urls 291 mpu := resp.JSON201 292 var presignedUrls []string 293 if mpu.PresignedUrls != nil { 294 presignedUrls = *mpu.PresignedUrls 295 } 296 if len(presignedUrls) != u.numParts { 297 return nil, fmt.Errorf("create presign multipart upload: %w, expected %d presigned urls, got %d", ErrRequestFailed, u.numParts, len(presignedUrls)) 298 } 299 return mpu, nil 300 } 301 302 func (u *presignUpload) uploadPart(ctx context.Context, partReader *io.SectionReader, partURL string) (string, error) { 303 req, err := http.NewRequestWithContext(ctx, http.MethodPut, partURL, partReader) 304 if err != nil { 305 return "", err 306 } 307 req.ContentLength = partReader.Size() 308 if u.contentType != "" { 309 req.Header.Set("Content-Type", u.contentType) 310 } 311 312 resp, err := u.uploader.HTTPClient.Do(req) 313 if err != nil { 314 return "", err 315 } 316 defer func() { _ = resp.Body.Close() }() 317 if !httputil.IsSuccessStatusCode(resp) { 318 return "", fmt.Errorf("upload %s part(%s): %w", partURL, resp.Status, ErrRequestFailed) 319 } 320 321 etag := extractEtagFromResponseHeader(resp.Header) 322 if etag == "" { 323 return "", fmt.Errorf("upload etag is missing %s: %w", partURL, ErrRequestFailed) 324 } 325 return etag, nil 326 } 327 328 func (u *presignUpload) uploadObject(ctx context.Context) (*apigen.ObjectStats, error) { 329 stagingLocation, err := getPhysicalAddress(ctx, u.uploader.Client, u.repoID, u.branchID, &apigen.GetPhysicalAddressParams{ 330 Path: u.objectPath, 331 Presign: swag.Bool(true), 332 }) 333 if err != nil { 334 return nil, err 335 } 336 preSignURL := swag.StringValue(stagingLocation.PresignedUrl) 337 338 var body io.ReadSeeker 339 if u.size > 0 { 340 // Passing Reader with content length == 0 results in 501 Not Implemented 341 body = u.reader 342 } 343 344 req, err := http.NewRequestWithContext(ctx, http.MethodPut, preSignURL, body) 345 if err != nil { 346 return nil, err 347 } 348 req.ContentLength = u.size 349 if u.contentType != "" { 350 req.Header.Set("Content-Type", u.contentType) 351 } 352 if isAzureBlobURL(req.URL) { 353 req.Header.Set("x-ms-blob-type", "BlockBlob") 354 } 355 356 putResp, err := u.uploader.HTTPClient.Do(req) 357 if err != nil { 358 return nil, err 359 } 360 defer func() { _ = putResp.Body.Close() }() 361 if !httputil.IsSuccessStatusCode(putResp) { 362 return nil, fmt.Errorf("upload %w %s: %s", ErrRequestFailed, preSignURL, putResp.Status) 363 } 364 365 etag := extractEtagFromResponseHeader(putResp.Header) 366 if etag == "" { 367 return nil, fmt.Errorf("upload %w %s: etag is missing", ErrRequestFailed, preSignURL) 368 } 369 370 linkReqBody := apigen.LinkPhysicalAddressJSONRequestBody{ 371 Checksum: etag, 372 SizeBytes: u.size, 373 Staging: *stagingLocation, 374 UserMetadata: &apigen.StagingMetadata_UserMetadata{ 375 AdditionalProperties: u.metadata, 376 }, 377 ContentType: apiutil.Ptr(u.contentType), 378 } 379 linkResp, err := u.uploader.Client.LinkPhysicalAddressWithResponse(ctx, u.repoID, u.branchID, 380 &apigen.LinkPhysicalAddressParams{ 381 Path: u.objectPath, 382 }, linkReqBody) 383 if err != nil { 384 return nil, fmt.Errorf("link object to backing store: %w", err) 385 } 386 if linkResp.JSON200 != nil { 387 return linkResp.JSON200, nil 388 } 389 if linkResp.JSON409 != nil { 390 return nil, ErrConflict 391 } 392 return nil, fmt.Errorf("link object to backing store: %w (%s)", ErrRequestFailed, linkResp.Status()) 393 } 394 395 func (u *presignUpload) Upload(ctx context.Context) (*apigen.ObjectStats, error) { 396 // use multipart upload if: 397 // 1. Multipart upload is supported by the server. 398 // 2. Reader supports ReaderAt. 399 // 3. Content size is greater than MinUploadPartSize. 400 if u.uploader.MultipartSupport && u.size >= MinUploadPartSize && u.readerAt != nil { 401 return u.uploadMultipart(ctx) 402 } 403 return u.uploadObject(ctx) 404 } 405 406 func ClientUploadPreSign(ctx context.Context, client apigen.ClientWithResponsesInterface, repoID, branchID, objPath string, metadata map[string]string, contentType string, contents io.ReadSeeker, presignMultipartSupport bool) (*apigen.ObjectStats, error) { 407 // upload loop, retry on conflict 408 uploader := NewPreSignUploader(client, presignMultipartSupport) 409 for { 410 stats, err := uploader.Upload(ctx, repoID, branchID, objPath, contents, contentType, metadata) 411 if err == nil { 412 return stats, nil 413 } 414 // break in case of error other than conflict, otherwise retry 415 if !errors.Is(err, ErrConflict) { 416 return nil, err 417 } 418 } 419 } 420 421 func isAzureBlobURL(u *url.URL) bool { 422 _, _, err := azure.ParseURL(u) 423 return err == nil 424 } 425 426 // extractEtagFromResponseHeader extracts the ETag from the response header. 427 // If the response contains a Content-MD5 header, it will be decoded from base64 and returned as hex. 428 func extractEtagFromResponseHeader(h http.Header) string { 429 // prefer Content-MD5 if exists 430 contentMD5 := h.Get("Content-MD5") 431 if contentMD5 != "" { 432 // decode base64, return as hex 433 decodeMD5, err := base64.StdEncoding.DecodeString(contentMD5) 434 if err == nil { 435 return hex.EncodeToString(decodeMD5) 436 } 437 } 438 // fallback to ETag 439 etag := h.Get("ETag") 440 etag = strings.TrimFunc(etag, func(r rune) bool { return r == '"' || r == ' ' }) 441 return etag 442 } 443 444 func getPhysicalAddress(ctx context.Context, client apigen.ClientWithResponsesInterface, repoID string, branchID string, params *apigen.GetPhysicalAddressParams) (*apigen.StagingLocation, error) { 445 resp, err := client.GetPhysicalAddressWithResponse(ctx, repoID, branchID, params) 446 if err != nil { 447 return nil, fmt.Errorf("get physical address to upload object: %w", err) 448 } 449 if resp.JSONDefault != nil { 450 return nil, fmt.Errorf("%w: %s", ErrRequestFailed, resp.JSONDefault.Message) 451 } 452 if resp.JSON200 == nil { 453 return nil, fmt.Errorf("%w: %s (status code %d)", ErrRequestFailed, resp.Status(), resp.StatusCode()) 454 } 455 return resp.JSON200, nil 456 }