github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/api/helpers/upload.go (about)

     1  // Package helpers provides useful wrappers for clients using the lakeFS OpenAPI.
     2  package helpers
     3  
     4  import (
     5  	"context"
     6  	"encoding/base64"
     7  	"encoding/hex"
     8  	"errors"
     9  	"fmt"
    10  	"io"
    11  	"mime"
    12  	"mime/multipart"
    13  	"net/http"
    14  	"net/textproto"
    15  	"net/url"
    16  	"path/filepath"
    17  	"strings"
    18  
    19  	"github.com/go-openapi/swag"
    20  	"github.com/treeverse/lakefs/pkg/api/apigen"
    21  	"github.com/treeverse/lakefs/pkg/api/apiutil"
    22  	"github.com/treeverse/lakefs/pkg/block/azure"
    23  	"github.com/treeverse/lakefs/pkg/httputil"
    24  	"golang.org/x/sync/errgroup"
    25  )
    26  
    27  // MaxUploadParts is the maximum allowed number of parts in a multipart upload
    28  const MaxUploadParts int32 = 10000
    29  
    30  // MinUploadPartSize is the minimum allowed part size when uploading a part
    31  const MinUploadPartSize int64 = 1024 * 1024 * 5
    32  
    33  // DefaultUploadPartSize is the default part size to buffer chunks of a payload into
    34  const DefaultUploadPartSize = MinUploadPartSize
    35  
    36  // DefaultUploadConcurrency is the default number of goroutines to spin up when uploading a multipart upload
    37  const DefaultUploadConcurrency = 5
    38  
    39  // ClientUpload uploads contents as a file via lakeFS
    40  func ClientUpload(ctx context.Context, client apigen.ClientWithResponsesInterface, repoID, branchID, objPath string, metadata map[string]string, contentType string, contents io.ReadSeeker) (*apigen.ObjectStats, error) {
    41  	pr, pw := io.Pipe()
    42  	defer func() {
    43  		_ = pr.Close()
    44  	}()
    45  
    46  	mpw := multipart.NewWriter(pw)
    47  	mpContentType := mpw.FormDataContentType()
    48  	go func() {
    49  		defer func() {
    50  			_ = mpw.Close()
    51  			_ = pw.Close()
    52  		}()
    53  
    54  		filename := filepath.Base(objPath)
    55  		const fieldName = "content"
    56  		var err error
    57  		var cw io.Writer
    58  		// when no content-type is specified, we let 'CreateFromFile' add the part with the default content type.
    59  		// otherwise, we add a part and set the content-type.
    60  		if contentType != "" {
    61  			h := make(textproto.MIMEHeader)
    62  			contentDisposition := mime.FormatMediaType("form-data", map[string]string{"name": fieldName, "filename": filename})
    63  			h.Set("Content-Disposition", contentDisposition)
    64  			h.Set("Content-Type", contentType)
    65  			cw, err = mpw.CreatePart(h)
    66  		} else {
    67  			cw, err = mpw.CreateFormFile(fieldName, filename)
    68  		}
    69  		if err != nil {
    70  			_ = pw.CloseWithError(err)
    71  			return
    72  		}
    73  		if _, err := io.Copy(cw, contents); err != nil {
    74  			_ = pw.CloseWithError(err)
    75  			return
    76  		}
    77  	}()
    78  
    79  	resp, err := client.UploadObjectWithBodyWithResponse(ctx, repoID, branchID, &apigen.UploadObjectParams{
    80  		Path: objPath,
    81  	}, mpContentType, pr, func(ctx context.Context, req *http.Request) error {
    82  		var metaKey string
    83  		for k, v := range metadata {
    84  			lowerKey := strings.ToLower(k)
    85  			if strings.HasPrefix(lowerKey, apiutil.LakeFSMetadataPrefix) {
    86  				metaKey = apiutil.LakeFSHeaderInternalPrefix + lowerKey[len(apiutil.LakeFSMetadataPrefix):]
    87  			} else {
    88  				metaKey = apiutil.LakeFSHeaderMetadataPrefix + lowerKey
    89  			}
    90  			req.Header.Set(metaKey, v)
    91  		}
    92  		return nil
    93  	})
    94  	if err != nil {
    95  		return nil, err
    96  	}
    97  	if resp.JSON201 == nil {
    98  		return nil, ResponseAsError(resp)
    99  	}
   100  	return resp.JSON201, nil
   101  }
   102  
   103  // PreSignUploader uploads contents as a file via lakeFS using presigned urls
   104  // It supports both multipart and single part uploads.
   105  type PreSignUploader struct {
   106  	Concurrency      int
   107  	HTTPClient       *http.Client
   108  	Client           apigen.ClientWithResponsesInterface
   109  	MultipartSupport bool
   110  }
   111  
   112  // presignUpload represents a single upload request
   113  type presignUpload struct {
   114  	uploader    PreSignUploader
   115  	repoID      string
   116  	branchID    string
   117  	objectPath  string
   118  	metadata    map[string]string
   119  	contentType string
   120  	reader      io.ReadSeeker
   121  	readerAt    io.ReaderAt
   122  	size        int64
   123  	partSize    int64
   124  	numParts    int
   125  }
   126  
   127  func NewPreSignUploader(client apigen.ClientWithResponsesInterface, multipartSupport bool) *PreSignUploader {
   128  	return &PreSignUploader{
   129  		Concurrency:      DefaultUploadConcurrency,
   130  		HTTPClient:       http.DefaultClient,
   131  		Client:           client,
   132  		MultipartSupport: multipartSupport,
   133  	}
   134  }
   135  
   136  func (u *PreSignUploader) Upload(ctx context.Context, repoID string, branchID string, objPath string, content io.ReadSeeker,
   137  	contentType string, metadata map[string]string,
   138  ) (*apigen.ObjectStats, error) {
   139  	// calculate size and rewind content
   140  	size, err := content.Seek(0, io.SeekEnd)
   141  	if err != nil {
   142  		return nil, err
   143  	}
   144  	if _, seekErr := content.Seek(0, io.SeekStart); seekErr != nil {
   145  		return nil, seekErr
   146  	}
   147  	// check if content implements io.ReaderAt for multipart upload
   148  	readerAt, _ := content.(io.ReaderAt)
   149  
   150  	// create upload object - represents a single upload request
   151  	upload := &presignUpload{
   152  		uploader:    *u,
   153  		repoID:      repoID,
   154  		branchID:    branchID,
   155  		objectPath:  objPath,
   156  		metadata:    metadata,
   157  		contentType: contentType,
   158  		reader:      content,
   159  		readerAt:    readerAt,
   160  		size:        size,
   161  	}
   162  	return upload.Upload(ctx)
   163  }
   164  
   165  type presignPartReader struct {
   166  	Reader *io.SectionReader
   167  	URL    string
   168  }
   169  
   170  func (u *presignUpload) uploadMultipart(ctx context.Context) (*apigen.ObjectStats, error) {
   171  	mpu, err := u.initMultipart(ctx)
   172  	if err != nil {
   173  		return nil, err
   174  	}
   175  
   176  	// prepare readers
   177  	parts := make([]presignPartReader, u.numParts)
   178  	rem := u.size
   179  	off := int64(0)
   180  	for i := range parts {
   181  		size := min(u.partSize, rem)
   182  		parts[i].Reader = io.NewSectionReader(u.readerAt, off, size) // use `readerAt`
   183  		parts[i].URL = (*mpu.PresignedUrls)[i]
   184  		rem -= size
   185  		off += size
   186  	}
   187  
   188  	// upload parts in parallel, fill uploadParts array with results
   189  	uploadParts := make([]apigen.UploadPart, u.numParts)
   190  	g, grpCtx := errgroup.WithContext(context.Background())
   191  	g.SetLimit(u.uploader.Concurrency)
   192  
   193  	for i := 0; i < u.numParts; i++ {
   194  		i := i // pinning
   195  		g.Go(func() error {
   196  			etag, err := u.uploadPart(grpCtx, parts[i].Reader, parts[i].URL)
   197  			if err != nil {
   198  				return fmt.Errorf("part %d %w", i+1, err)
   199  			}
   200  			uploadParts[i] = apigen.UploadPart{
   201  				PartNumber: i + 1,
   202  				Etag:       etag,
   203  			}
   204  			return nil
   205  		})
   206  	}
   207  
   208  	// wait for all parts to be uploaded
   209  	if err := g.Wait(); err != nil {
   210  		// abort upload using a new context to avoid context cancellation
   211  		abortErr := u.abortMultipart(context.Background(), mpu)
   212  		if abortErr != nil {
   213  			err = errors.Join(err, abortErr)
   214  		}
   215  		return nil, err
   216  	}
   217  
   218  	return u.completeMultipart(ctx, mpu, uploadParts)
   219  }
   220  
   221  func (u *presignUpload) completeMultipart(ctx context.Context, mpu *apigen.PresignMultipartUpload, uploadParts []apigen.UploadPart) (*apigen.ObjectStats, error) {
   222  	resp, err := u.uploader.Client.CompletePresignMultipartUploadWithResponse(ctx, u.repoID, u.branchID, mpu.UploadId,
   223  		&apigen.CompletePresignMultipartUploadParams{
   224  			Path: u.objectPath,
   225  		},
   226  		apigen.CompletePresignMultipartUploadJSONRequestBody{
   227  			Parts:           uploadParts,
   228  			PhysicalAddress: mpu.PhysicalAddress,
   229  			UserMetadata: &apigen.CompletePresignMultipartUpload_UserMetadata{
   230  				AdditionalProperties: u.metadata,
   231  			},
   232  			ContentType: apiutil.Ptr(u.contentType),
   233  		},
   234  	)
   235  	if err != nil {
   236  		return nil, fmt.Errorf("complete presign multipart upload: %w", err)
   237  	}
   238  	if resp.JSON409 != nil {
   239  		return nil, ErrConflict
   240  	}
   241  	if resp.JSON200 == nil {
   242  		return nil, fmt.Errorf("complete presign multipart upload: %w", ResponseAsError(resp))
   243  	}
   244  	return resp.JSON200, nil
   245  }
   246  
   247  func (u *presignUpload) abortMultipart(ctx context.Context, mpu *apigen.PresignMultipartUpload) error {
   248  	resp, err := u.uploader.Client.AbortPresignMultipartUploadWithResponse(ctx, u.repoID, u.branchID, mpu.UploadId,
   249  		&apigen.AbortPresignMultipartUploadParams{
   250  			Path: u.objectPath,
   251  		},
   252  		apigen.AbortPresignMultipartUploadJSONRequestBody{
   253  			PhysicalAddress: mpu.PhysicalAddress,
   254  		})
   255  	if err != nil {
   256  		return fmt.Errorf("abort presign multipart upload: %w", err)
   257  	}
   258  	if resp.StatusCode() != http.StatusNoContent {
   259  		return fmt.Errorf("abort presign multipart upload: %w", ResponseAsError(resp))
   260  	}
   261  	return nil
   262  }
   263  
   264  func (u *presignUpload) initMultipart(ctx context.Context) (*apigen.PresignMultipartUpload, error) {
   265  	// adjust part size
   266  	u.partSize = DefaultUploadPartSize
   267  	if u.size/u.partSize >= int64(MaxUploadParts) {
   268  		// Add one to the part size to account for remainders
   269  		u.partSize = (u.size / int64(MaxUploadParts)) + 1
   270  	}
   271  
   272  	// calculate the number of parts
   273  	u.numParts = int(u.size / u.partSize)
   274  	if u.size%u.partSize != 0 {
   275  		u.numParts++
   276  	}
   277  
   278  	// create presign multipart upload
   279  	resp, err := u.uploader.Client.CreatePresignMultipartUploadWithResponse(ctx, u.repoID, u.branchID, &apigen.CreatePresignMultipartUploadParams{
   280  		Path:  u.objectPath,
   281  		Parts: swag.Int(u.numParts),
   282  	})
   283  	if err != nil {
   284  		return nil, fmt.Errorf("create presign multipart upload: %w", err)
   285  	}
   286  	if resp.JSON201 == nil {
   287  		return nil, fmt.Errorf("create presign multipart upload: %w", ResponseAsError(resp))
   288  	}
   289  
   290  	// verify we got the expected number of presigned urls
   291  	mpu := resp.JSON201
   292  	var presignedUrls []string
   293  	if mpu.PresignedUrls != nil {
   294  		presignedUrls = *mpu.PresignedUrls
   295  	}
   296  	if len(presignedUrls) != u.numParts {
   297  		return nil, fmt.Errorf("create presign multipart upload: %w, expected %d presigned urls, got %d", ErrRequestFailed, u.numParts, len(presignedUrls))
   298  	}
   299  	return mpu, nil
   300  }
   301  
   302  func (u *presignUpload) uploadPart(ctx context.Context, partReader *io.SectionReader, partURL string) (string, error) {
   303  	req, err := http.NewRequestWithContext(ctx, http.MethodPut, partURL, partReader)
   304  	if err != nil {
   305  		return "", err
   306  	}
   307  	req.ContentLength = partReader.Size()
   308  	if u.contentType != "" {
   309  		req.Header.Set("Content-Type", u.contentType)
   310  	}
   311  
   312  	resp, err := u.uploader.HTTPClient.Do(req)
   313  	if err != nil {
   314  		return "", err
   315  	}
   316  	defer func() { _ = resp.Body.Close() }()
   317  	if !httputil.IsSuccessStatusCode(resp) {
   318  		return "", fmt.Errorf("upload %s part(%s): %w", partURL, resp.Status, ErrRequestFailed)
   319  	}
   320  
   321  	etag := extractEtagFromResponseHeader(resp.Header)
   322  	if etag == "" {
   323  		return "", fmt.Errorf("upload etag is missing %s: %w", partURL, ErrRequestFailed)
   324  	}
   325  	return etag, nil
   326  }
   327  
   328  func (u *presignUpload) uploadObject(ctx context.Context) (*apigen.ObjectStats, error) {
   329  	stagingLocation, err := getPhysicalAddress(ctx, u.uploader.Client, u.repoID, u.branchID, &apigen.GetPhysicalAddressParams{
   330  		Path:    u.objectPath,
   331  		Presign: swag.Bool(true),
   332  	})
   333  	if err != nil {
   334  		return nil, err
   335  	}
   336  	preSignURL := swag.StringValue(stagingLocation.PresignedUrl)
   337  
   338  	var body io.ReadSeeker
   339  	if u.size > 0 {
   340  		// Passing Reader with content length == 0 results in 501 Not Implemented
   341  		body = u.reader
   342  	}
   343  
   344  	req, err := http.NewRequestWithContext(ctx, http.MethodPut, preSignURL, body)
   345  	if err != nil {
   346  		return nil, err
   347  	}
   348  	req.ContentLength = u.size
   349  	if u.contentType != "" {
   350  		req.Header.Set("Content-Type", u.contentType)
   351  	}
   352  	if isAzureBlobURL(req.URL) {
   353  		req.Header.Set("x-ms-blob-type", "BlockBlob")
   354  	}
   355  
   356  	putResp, err := u.uploader.HTTPClient.Do(req)
   357  	if err != nil {
   358  		return nil, err
   359  	}
   360  	defer func() { _ = putResp.Body.Close() }()
   361  	if !httputil.IsSuccessStatusCode(putResp) {
   362  		return nil, fmt.Errorf("upload %w %s: %s", ErrRequestFailed, preSignURL, putResp.Status)
   363  	}
   364  
   365  	etag := extractEtagFromResponseHeader(putResp.Header)
   366  	if etag == "" {
   367  		return nil, fmt.Errorf("upload %w %s: etag is missing", ErrRequestFailed, preSignURL)
   368  	}
   369  
   370  	linkReqBody := apigen.LinkPhysicalAddressJSONRequestBody{
   371  		Checksum:  etag,
   372  		SizeBytes: u.size,
   373  		Staging:   *stagingLocation,
   374  		UserMetadata: &apigen.StagingMetadata_UserMetadata{
   375  			AdditionalProperties: u.metadata,
   376  		},
   377  		ContentType: apiutil.Ptr(u.contentType),
   378  	}
   379  	linkResp, err := u.uploader.Client.LinkPhysicalAddressWithResponse(ctx, u.repoID, u.branchID,
   380  		&apigen.LinkPhysicalAddressParams{
   381  			Path: u.objectPath,
   382  		}, linkReqBody)
   383  	if err != nil {
   384  		return nil, fmt.Errorf("link object to backing store: %w", err)
   385  	}
   386  	if linkResp.JSON200 != nil {
   387  		return linkResp.JSON200, nil
   388  	}
   389  	if linkResp.JSON409 != nil {
   390  		return nil, ErrConflict
   391  	}
   392  	return nil, fmt.Errorf("link object to backing store: %w (%s)", ErrRequestFailed, linkResp.Status())
   393  }
   394  
   395  func (u *presignUpload) Upload(ctx context.Context) (*apigen.ObjectStats, error) {
   396  	// use multipart upload if:
   397  	// 1. Multipart upload is supported by the server.
   398  	// 2. Reader supports ReaderAt.
   399  	// 3. Content size is greater than MinUploadPartSize.
   400  	if u.uploader.MultipartSupport && u.size >= MinUploadPartSize && u.readerAt != nil {
   401  		return u.uploadMultipart(ctx)
   402  	}
   403  	return u.uploadObject(ctx)
   404  }
   405  
   406  func ClientUploadPreSign(ctx context.Context, client apigen.ClientWithResponsesInterface, repoID, branchID, objPath string, metadata map[string]string, contentType string, contents io.ReadSeeker, presignMultipartSupport bool) (*apigen.ObjectStats, error) {
   407  	// upload loop, retry on conflict
   408  	uploader := NewPreSignUploader(client, presignMultipartSupport)
   409  	for {
   410  		stats, err := uploader.Upload(ctx, repoID, branchID, objPath, contents, contentType, metadata)
   411  		if err == nil {
   412  			return stats, nil
   413  		}
   414  		// break in case of error other than conflict, otherwise retry
   415  		if !errors.Is(err, ErrConflict) {
   416  			return nil, err
   417  		}
   418  	}
   419  }
   420  
   421  func isAzureBlobURL(u *url.URL) bool {
   422  	_, _, err := azure.ParseURL(u)
   423  	return err == nil
   424  }
   425  
   426  // extractEtagFromResponseHeader extracts the ETag from the response header.
   427  // If the response contains a Content-MD5 header, it will be decoded from base64 and returned as hex.
   428  func extractEtagFromResponseHeader(h http.Header) string {
   429  	// prefer Content-MD5 if exists
   430  	contentMD5 := h.Get("Content-MD5")
   431  	if contentMD5 != "" {
   432  		// decode base64, return as hex
   433  		decodeMD5, err := base64.StdEncoding.DecodeString(contentMD5)
   434  		if err == nil {
   435  			return hex.EncodeToString(decodeMD5)
   436  		}
   437  	}
   438  	// fallback to ETag
   439  	etag := h.Get("ETag")
   440  	etag = strings.TrimFunc(etag, func(r rune) bool { return r == '"' || r == ' ' })
   441  	return etag
   442  }
   443  
   444  func getPhysicalAddress(ctx context.Context, client apigen.ClientWithResponsesInterface, repoID string, branchID string, params *apigen.GetPhysicalAddressParams) (*apigen.StagingLocation, error) {
   445  	resp, err := client.GetPhysicalAddressWithResponse(ctx, repoID, branchID, params)
   446  	if err != nil {
   447  		return nil, fmt.Errorf("get physical address to upload object: %w", err)
   448  	}
   449  	if resp.JSONDefault != nil {
   450  		return nil, fmt.Errorf("%w: %s", ErrRequestFailed, resp.JSONDefault.Message)
   451  	}
   452  	if resp.JSON200 == nil {
   453  		return nil, fmt.Errorf("%w: %s (status code %d)", ErrRequestFailed, resp.Status(), resp.StatusCode())
   454  	}
   455  	return resp.JSON200, nil
   456  }