github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/block/azure/walker.go (about) 1 package azure 2 3 import ( 4 "context" 5 "encoding/hex" 6 "errors" 7 "fmt" 8 "net/url" 9 "strings" 10 11 "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob" 12 "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container" 13 "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/service" 14 "github.com/treeverse/lakefs/pkg/block" 15 ) 16 17 const DirectoryBlobMetadataKey = "hdi_isfolder" 18 19 var ErrAzureInvalidURL = errors.New("invalid Azure storage URL") 20 21 // extractAzurePrefix takes a URL that looks like this: https://storageaccount.blob.core.windows.net/container/prefix 22 // and return the URL for the container and a prefix, if one exists 23 func extractAzurePrefix(storageURI *url.URL) (*url.URL, string, error) { 24 path := strings.TrimLeft(storageURI.Path, "/") 25 if len(path) == 0 { 26 return nil, "", fmt.Errorf("%w: could not parse container URL: %s", ErrAzureInvalidURL, storageURI) 27 } 28 parts := strings.SplitN(path, "/", 2) // nolint: mnd 29 if len(parts) == 1 { 30 // we only have a container 31 return storageURI, "", nil 32 } 33 // we have both prefix and storage container, rebuild URL 34 relativePath := url.URL{Path: "/" + parts[0]} 35 return storageURI.ResolveReference(&relativePath), parts[1], nil 36 } 37 38 func getAzureBlobURL(containerURL *url.URL, blobName string) *url.URL { 39 relativePath := url.URL{Path: containerURL.Path + "/" + blobName} 40 return containerURL.ResolveReference(&relativePath) 41 } 42 43 // isBlobItemFolder returns true if the blob item is a folder. 44 // Make sure that metadata is populated before calling this function. 45 // Example: for listing using blob API passing options with `Include: container.ListBlobsInclude{ Metadata: true }` 46 // will populate the metadata. 47 func isBlobItemFolder(blobItem *container.BlobItem) bool { 48 if blobItem.Metadata == nil { 49 return false 50 } 51 if blobItem.Properties.ContentLength != nil && *blobItem.Properties.ContentLength != 0 { 52 return false 53 } 54 isFolder, ok := blobItem.Metadata[DirectoryBlobMetadataKey] 55 if !ok || isFolder == nil { 56 return false 57 } 58 return *isFolder == "true" 59 } 60 61 // extractBlobItemEtag etag set by content md5 with fallback to use Etag value 62 func extractBlobItemEtag(blobItem *container.BlobItem) string { 63 if blobItem.Properties.ContentMD5 != nil { 64 return hex.EncodeToString(blobItem.Properties.ContentMD5) 65 } 66 if blobItem.Properties.ETag != nil { 67 etag := string(*blobItem.Properties.ETag) 68 return strings.TrimFunc(etag, func(r rune) bool { return r == '"' || r == ' ' }) 69 } 70 return "" 71 } 72 73 // 74 // DataLakeWalker 75 // 76 77 func NewAzureDataLakeWalker(svc *service.Client, skipOutOfOrder bool) (*DataLakeWalker, error) { 78 return &DataLakeWalker{ 79 client: svc, 80 mark: block.Mark{HasMore: true}, 81 skipOutOfOrder: skipOutOfOrder, 82 }, nil 83 } 84 85 type DataLakeWalker struct { 86 client *service.Client 87 mark block.Mark 88 skipped []block.ObjectStoreEntry 89 skipOutOfOrder bool 90 } 91 92 func (a *DataLakeWalker) Walk(ctx context.Context, storageURI *url.URL, op block.WalkOptions, walkFn func(e block.ObjectStoreEntry) error) error { 93 // we use bucket as container and prefix as a path 94 containerURL, prefix, err := extractAzurePrefix(storageURI) 95 if err != nil { 96 return err 97 } 98 var basePath string 99 if idx := strings.LastIndex(prefix, "/"); idx != -1 { 100 basePath = prefix[:idx+1] 101 } 102 103 qk, err := ResolveBlobURLInfoFromURL(containerURL) 104 if err != nil { 105 return err 106 } 107 108 containerClient := a.client.NewContainerClient(qk.ContainerName) 109 listBlob := containerClient.NewListBlobsFlatPager(&azblob.ListBlobsFlatOptions{ 110 Prefix: &prefix, 111 Marker: &op.ContinuationToken, 112 Include: container.ListBlobsInclude{ 113 Metadata: true, 114 }, 115 }) 116 117 skipCount := 0 118 prev := "" 119 for listBlob.More() { 120 resp, err := listBlob.NextPage(ctx) 121 if err != nil { 122 return err 123 } 124 if resp.Marker != nil { 125 a.mark.ContinuationToken = *resp.Marker 126 } 127 for _, blobInfo := range resp.Segment.BlobItems { 128 // skipping everything in the page which is before 'After' (without forgetting the possible empty string key!) 129 if op.After != "" && *blobInfo.Name <= op.After { 130 continue 131 } 132 133 // Skip folders 134 if isBlobItemFolder(blobInfo) { 135 continue 136 } 137 138 entry := block.ObjectStoreEntry{ 139 FullKey: *blobInfo.Name, 140 RelativeKey: strings.TrimPrefix(*blobInfo.Name, basePath), 141 Address: getAzureBlobURL(containerURL, *blobInfo.Name).String(), 142 ETag: extractBlobItemEtag(blobInfo), 143 Mtime: *blobInfo.Properties.LastModified, 144 Size: *blobInfo.Properties.ContentLength, 145 } 146 if a.skipOutOfOrder && strings.Compare(prev, *blobInfo.Name) > 0 { // skip out of order 147 a.skipped = append(a.skipped, entry) 148 skipCount++ 149 continue 150 } 151 prev = *blobInfo.Name 152 153 a.mark.LastKey = *blobInfo.Name 154 if err := walkFn(entry); err != nil { 155 return err 156 } 157 } 158 } 159 a.mark = block.Mark{ 160 HasMore: false, 161 } 162 163 return nil 164 } 165 166 func (a *DataLakeWalker) Marker() block.Mark { 167 return a.mark 168 } 169 170 func (a *DataLakeWalker) GetSkippedEntries() []block.ObjectStoreEntry { 171 return a.skipped 172 }