github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/block/azure/walker.go (about)

     1  package azure
     2  
     3  import (
     4  	"context"
     5  	"encoding/hex"
     6  	"errors"
     7  	"fmt"
     8  	"net/url"
     9  	"strings"
    10  
    11  	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob"
    12  	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container"
    13  	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/service"
    14  	"github.com/treeverse/lakefs/pkg/block"
    15  )
    16  
// DirectoryBlobMetadataKey is the blob metadata key inspected by isBlobItemFolder:
// a blob with no content that carries this key with the value "true" is treated as a folder.
const DirectoryBlobMetadataKey = "hdi_isfolder"
    18  
    19  var ErrAzureInvalidURL = errors.New("invalid Azure storage URL")
    20  
    21  // extractAzurePrefix takes a URL that looks like this: https://storageaccount.blob.core.windows.net/container/prefix
    22  // and return the URL for the container and a prefix, if one exists
    23  func extractAzurePrefix(storageURI *url.URL) (*url.URL, string, error) {
    24  	path := strings.TrimLeft(storageURI.Path, "/")
    25  	if len(path) == 0 {
    26  		return nil, "", fmt.Errorf("%w: could not parse container URL: %s", ErrAzureInvalidURL, storageURI)
    27  	}
    28  	parts := strings.SplitN(path, "/", 2) // nolint: mnd
    29  	if len(parts) == 1 {
    30  		// we only have a container
    31  		return storageURI, "", nil
    32  	}
    33  	// we have both prefix and storage container, rebuild URL
    34  	relativePath := url.URL{Path: "/" + parts[0]}
    35  	return storageURI.ResolveReference(&relativePath), parts[1], nil
    36  }
    37  
    38  func getAzureBlobURL(containerURL *url.URL, blobName string) *url.URL {
    39  	relativePath := url.URL{Path: containerURL.Path + "/" + blobName}
    40  	return containerURL.ResolveReference(&relativePath)
    41  }
    42  
    43  // isBlobItemFolder returns true if the blob item is a folder.
    44  // Make sure that metadata is populated before calling this function.
    45  // Example: for listing using blob API passing options with `Include: container.ListBlobsInclude{ Metadata: true }`
    46  // will populate the metadata.
    47  func isBlobItemFolder(blobItem *container.BlobItem) bool {
    48  	if blobItem.Metadata == nil {
    49  		return false
    50  	}
    51  	if blobItem.Properties.ContentLength != nil && *blobItem.Properties.ContentLength != 0 {
    52  		return false
    53  	}
    54  	isFolder, ok := blobItem.Metadata[DirectoryBlobMetadataKey]
    55  	if !ok || isFolder == nil {
    56  		return false
    57  	}
    58  	return *isFolder == "true"
    59  }
    60  
    61  // extractBlobItemEtag etag set by content md5 with fallback to use Etag value
    62  func extractBlobItemEtag(blobItem *container.BlobItem) string {
    63  	if blobItem.Properties.ContentMD5 != nil {
    64  		return hex.EncodeToString(blobItem.Properties.ContentMD5)
    65  	}
    66  	if blobItem.Properties.ETag != nil {
    67  		etag := string(*blobItem.Properties.ETag)
    68  		return strings.TrimFunc(etag, func(r rune) bool { return r == '"' || r == ' ' })
    69  	}
    70  	return ""
    71  }
    72  
    73  //
    74  // DataLakeWalker
    75  //
    76  
    77  func NewAzureDataLakeWalker(svc *service.Client, skipOutOfOrder bool) (*DataLakeWalker, error) {
    78  	return &DataLakeWalker{
    79  		client:         svc,
    80  		mark:           block.Mark{HasMore: true},
    81  		skipOutOfOrder: skipOutOfOrder,
    82  	}, nil
    83  }
    84  
// DataLakeWalker walks the blobs of an Azure storage container using a flat
// listing and leaves out blobs that are marked as folders.
type DataLakeWalker struct {
	// client is the service client from which Walk creates a container client
	client *service.Client
	// mark is the walk position returned by Marker
	mark block.Mark
	// skipped collects the entries Walk set aside for being out of order
	skipped []block.ObjectStoreEntry
	// skipOutOfOrder makes Walk set aside entries whose key sorts before the previously reported key
	skipOutOfOrder bool
}
    91  
    92  func (a *DataLakeWalker) Walk(ctx context.Context, storageURI *url.URL, op block.WalkOptions, walkFn func(e block.ObjectStoreEntry) error) error {
    93  	// we use bucket as container and prefix as a path
    94  	containerURL, prefix, err := extractAzurePrefix(storageURI)
    95  	if err != nil {
    96  		return err
    97  	}
    98  	var basePath string
    99  	if idx := strings.LastIndex(prefix, "/"); idx != -1 {
   100  		basePath = prefix[:idx+1]
   101  	}
   102  
   103  	qk, err := ResolveBlobURLInfoFromURL(containerURL)
   104  	if err != nil {
   105  		return err
   106  	}
   107  
   108  	containerClient := a.client.NewContainerClient(qk.ContainerName)
   109  	listBlob := containerClient.NewListBlobsFlatPager(&azblob.ListBlobsFlatOptions{
   110  		Prefix: &prefix,
   111  		Marker: &op.ContinuationToken,
   112  		Include: container.ListBlobsInclude{
   113  			Metadata: true,
   114  		},
   115  	})
   116  
   117  	skipCount := 0
   118  	prev := ""
   119  	for listBlob.More() {
   120  		resp, err := listBlob.NextPage(ctx)
   121  		if err != nil {
   122  			return err
   123  		}
   124  		if resp.Marker != nil {
   125  			a.mark.ContinuationToken = *resp.Marker
   126  		}
   127  		for _, blobInfo := range resp.Segment.BlobItems {
   128  			// skipping everything in the page which is before 'After' (without forgetting the possible empty string key!)
   129  			if op.After != "" && *blobInfo.Name <= op.After {
   130  				continue
   131  			}
   132  
   133  			// Skip folders
   134  			if isBlobItemFolder(blobInfo) {
   135  				continue
   136  			}
   137  
   138  			entry := block.ObjectStoreEntry{
   139  				FullKey:     *blobInfo.Name,
   140  				RelativeKey: strings.TrimPrefix(*blobInfo.Name, basePath),
   141  				Address:     getAzureBlobURL(containerURL, *blobInfo.Name).String(),
   142  				ETag:        extractBlobItemEtag(blobInfo),
   143  				Mtime:       *blobInfo.Properties.LastModified,
   144  				Size:        *blobInfo.Properties.ContentLength,
   145  			}
   146  			if a.skipOutOfOrder && strings.Compare(prev, *blobInfo.Name) > 0 { // skip out of order
   147  				a.skipped = append(a.skipped, entry)
   148  				skipCount++
   149  				continue
   150  			}
   151  			prev = *blobInfo.Name
   152  
   153  			a.mark.LastKey = *blobInfo.Name
   154  			if err := walkFn(entry); err != nil {
   155  				return err
   156  			}
   157  		}
   158  	}
   159  	a.mark = block.Mark{
   160  		HasMore: false,
   161  	}
   162  
   163  	return nil
   164  }
   165  
// Marker returns the walker's current mark. While Walk is listing it holds the
// marker returned with the latest page and the last key passed to the walk
// callback; after Walk has gone through the whole listing it reports HasMore as false.
func (a *DataLakeWalker) Marker() block.Mark {
	return a.mark
}
   169  
// GetSkippedEntries returns the entries that Walk set aside because their key
// sorted before the previously reported key. The walker's own slice is returned, not a copy.
func (a *DataLakeWalker) GetSkippedEntries() []block.ObjectStoreEntry {
	return a.skipped
}