github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/file/s3file/versions_list.go (about) 1 package s3file 2 3 import ( 4 "context" 5 "strings" 6 "time" 7 8 "github.com/aws/aws-sdk-go/aws" 9 "github.com/aws/aws-sdk-go/service/s3" 10 "github.com/Schaudge/grailbase/errors" 11 "github.com/Schaudge/grailbase/file" 12 "github.com/Schaudge/grailbase/file/fsnode" 13 "github.com/Schaudge/grailbase/grail/biofs/biofseventlog" 14 "github.com/Schaudge/grailbase/must" 15 ) 16 17 // s3Query is a generic description of an S3 object or prefix. 18 type s3Query struct { 19 impl *s3Impl 20 // bucket must be non-empty. 21 bucket string 22 // key is either an S3 object's key or a key prefix (optionally ending with pathSeparator). 23 // "" is allowed and refers to the root of the bucket. 24 key string 25 } 26 27 func (q s3Query) path() string { return pathPrefix + q.bucket + pathSeparator + q.key } 28 29 // TODO: Dedupe with gfilefs. 30 const fileInfoCacheFor = 1 * time.Hour 31 32 type ( 33 // versionsDirViewGen lists all the versions of all the (direct child) objects in a single S3 34 // "directory" (that is, single bucket with a single key prefix). 35 // 36 // Note: We implement fsnode.ChildrenGenerator rather than fsnode.Iterator because it reduces 37 // implementation complexity. We need to parse three separate fields from listing responses 38 // so the implementation is a bit verbose, and Child()/Children() differences introduce edge 39 // cases we should test. But, we'll probably want to do this eventually. 40 versionsDirViewGen struct{ s3Query } 41 42 // versionsObjectGen lists the versions of an S3 object. Each version of the object is accessible 43 // via a child node. Additionally, if there are other S3 object versions that have this path as 44 // a prefix (or, in directory terms, if there used to be a directory with the same name as this 45 // file), a dir/ child provides access to those. 46 // 47 // Scheme: 48 // vVERSION_ID/ for each version 49 // vVERSION_ID (empty file) to mark deletion time 50 // dir/ for children, if there used to be a "directory" with this name 51 // TODO: 52 // @DATE/ -> VERSION_ID/ for each version 53 // latest/ -> VERSION_ID/ 54 // 0/, 1/, etc. -> VERSION_ID/ 55 // 56 // Note: We implement fsnode.ChildrenGenerator rather than fsnode.Iterator because it reduces 57 // implementation complexity and we expect number of versions per object to be relatively 58 // modest in practice. If we see performance problems, we can make it more sophisticated. 59 versionsObjViewGen struct{ s3Query } 60 ) 61 62 var ( 63 _ fsnode.ChildrenGenerator = versionsDirViewGen{} 64 _ fsnode.ChildrenGenerator = versionsObjViewGen{} 65 66 objViewDirInfo = fsnode.NewDirInfo("dir").WithCacheableFor(fileInfoCacheFor) 67 ) 68 69 func (g versionsDirViewGen) GenerateChildren(ctx context.Context) ([]fsnode.T, error) { 70 biofseventlog.UsedFeature("s3.versions.dirview") 71 dirPrefix := g.key 72 if dirPrefix != "" { 73 dirPrefix = g.key + pathSeparator 74 } 75 iterator, err := newVersionsIterator(ctx, g.impl, g.s3Query, s3.ListObjectVersionsInput{ 76 Bucket: aws.String(g.bucket), 77 Delimiter: aws.String(pathSeparator), 78 Prefix: aws.String(dirPrefix), 79 }) 80 if err != nil { 81 return nil, err 82 } 83 var ( 84 dirChildren = map[string]fsnode.T{} 85 objChildren = map[string][]fsnode.T{} 86 ) 87 for iterator.HasNextPage() { 88 out, err := iterator.NextPage(ctx) 89 if err != nil { 90 return nil, err 91 } 92 for _, common := range out.CommonPrefixes { 93 name := (*common.Prefix)[len(dirPrefix):] 94 name = name[:len(name)-len(pathSeparator)] 95 if name == "" { 96 // Note: S3 keys may have multiple trailing `/`s leading to name == "". 97 // For now, we skip these, making them inaccessible to users. 98 // TODO: Better mapping of S3 key semantics onto fsnode.T, for example recursively 99 // listing "key//" so we can merge those children into "key/"'s. 100 // See also: BXDS-2039 for the non-version listing case. 101 continue 102 } 103 q := g.s3Query 104 q.key = dirPrefix + name 105 dirChildren[name] = fsnode.NewParent( 106 fsnode.NewDirInfo(name).WithCacheableFor(fileInfoCacheFor), 107 versionsDirViewGen{q}) 108 } 109 for _, del := range out.DeleteMarkers { 110 if *del.Key == dirPrefix { 111 continue // Skip directory markers. 112 } 113 name := (*del.Key)[len(dirPrefix):] 114 objChildren[name] = append(objChildren[name], newDeleteChild(del)) 115 } 116 for _, version := range out.Versions { 117 if *version.Key == dirPrefix { 118 continue // Skip directory markers. 119 } 120 q := g.s3Query 121 q.key = *version.Key 122 name := q.key[len(dirPrefix):] 123 objChildren[name] = append(objChildren[name], newVersionChild(q, version)) 124 } 125 } 126 merged := make([]fsnode.T, 0, len(dirChildren)+len(objChildren)) 127 for name, child := range dirChildren { 128 if _, ok := objChildren[name]; ok { 129 // If a name was used both for files and directories, prefer files here, because 130 // the user can find the directory view under {name}/dir/. 131 continue 132 } 133 merged = append(merged, child) 134 } 135 for name, children := range objChildren { 136 merged = append(merged, fsnode.NewParent( 137 fsnode.NewDirInfo(name).WithCacheableFor(fileInfoCacheFor), 138 fsnode.ConstChildren(children...), 139 )) 140 } 141 return merged, nil 142 } 143 144 func (g versionsObjViewGen) GenerateChildren(ctx context.Context) ([]fsnode.T, error) { 145 biofseventlog.UsedFeature("s3.versions.objview") 146 iterator, err := newVersionsIterator(ctx, g.impl, g.s3Query, s3.ListObjectVersionsInput{ 147 Bucket: aws.String(g.bucket), 148 Delimiter: aws.String(pathSeparator), 149 Prefix: aws.String(g.key), 150 }) 151 if err != nil { 152 return nil, err 153 } 154 var ( 155 versions []fsnode.T 156 hasOtherChildren bool 157 ) 158 for iterator.HasNextPage() { 159 out, err := iterator.NextPage(ctx) 160 if err != nil { 161 return nil, err 162 } 163 if len(out.CommonPrefixes) > 0 { 164 hasOtherChildren = true 165 } 166 for _, del := range out.DeleteMarkers { 167 if *del.Key != g.key { 168 hasOtherChildren = true 169 // del is in a "subdirectory" of a previous directory version of our object. 170 // We don't render those here; instead we just add the dir/ child below. 171 continue 172 // Note: It seems like S3 returns delete markers in sorted order, but the API 173 // docs don't explicitly state this for ListObjectVersions [1] as they do for 174 // ListObjectsV2 [2], so we `continue` instead of `break`. We're still assuming 175 // API response pages are so ordered, though, because the alternative is unworkable. 176 // TODO: Ask AWS for explicit documentation on versions ordering. 177 // 178 // [1] https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectVersions.html 179 // [2] https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html 180 } 181 versions = append(versions, newDeleteChild(del)) 182 } 183 for _, version := range out.Versions { 184 if *version.Key != g.key { 185 hasOtherChildren = true 186 continue // See delete marker note. 187 } 188 versions = append(versions, newVersionChild(g.s3Query, version)) 189 } 190 } 191 if hasOtherChildren { 192 versions = append(versions, fsnode.NewParent(objViewDirInfo, versionsDirViewGen{g.s3Query})) 193 } 194 return versions, nil 195 } 196 197 func newVersionChild(q s3Query, v *s3.ObjectVersion) fsnode.Parent { 198 must.Truef(len(q.key) > 0, "creating child for %#v, %v", q, v) 199 name := q.key 200 if idx := strings.LastIndex(name, pathSeparator); idx >= 0 { 201 name = name[idx+len(pathSeparator):] 202 } 203 dirName := "v" + sanitizePathElem(*v.VersionId) 204 // Some S3 storage classes don't allow immediate, direct access (for example, requiring restore 205 // first). They also have very different cost profiles and users may not know about these and 206 // accidentally, expensively download many objects (especially with biofs, where it's easy 207 // to run `grep`, etc.). We have a best-effort allowlist and block others for now. 208 // TODO: Refine this UX. Maybe add a README.txt describing these properties and suggesting 209 // using the AWS console for unsupported objects. 210 // TODO: Consider supporting Glacier restoration. 211 // 212 // Note: This field's `enum` tag [1] names an enum with only one value (standard class [2]). 213 // However, as of this writing, we're seeing the API return more values, like DEEP_ARCHIVE. 214 // We assume it can take any value in s3.ObjectStorageClass* instead. 215 // TODO: Verify this, report to AWS, etc. 216 // [1] https://pkg.go.dev/github.com/aws/aws-sdk-go@v1.42.0/service/s3#ObjectVersion.StorageClass 217 // [2] https://pkg.go.dev/github.com/aws/aws-sdk-go@v1.42.0/service/s3#ObjectVersionStorageClassStandard 218 switch *v.StorageClass { 219 default: 220 dirName += "." + *v.StorageClass 221 return fsnode.NewParent( 222 fsnode.NewDirInfo(dirName).WithModTime(*v.LastModified), 223 fsnode.ConstChildren()) 224 case 225 s3.ObjectStorageClassStandard, 226 s3.ObjectStorageClassReducedRedundancy, 227 s3.ObjectStorageClassStandardIa, 228 s3.ObjectStorageClassOnezoneIa, 229 s3.ObjectStorageClassIntelligentTiering: 230 return fsnode.NewParent( 231 fsnode.NewDirInfo(dirName).WithModTime(*v.LastModified), 232 fsnode.ConstChildren( 233 versionsLeaf{ 234 FileInfo: fsnode.NewRegInfo(name).WithSize(*v.Size).WithModTime(*v.LastModified), 235 s3Query: q, 236 versionID: *v.VersionId, 237 }, 238 ), 239 ) 240 } 241 } 242 243 func newDeleteChild(del *s3.DeleteMarkerEntry) fsnode.T { 244 return fsnode.ConstLeaf( 245 fsnode.NewRegInfo("v"+sanitizePathElem(*del.VersionId)).WithModTime(*del.LastModified), 246 nil) 247 } 248 249 type versionsIterator struct { 250 in s3.ListObjectVersionsInput 251 eof bool 252 policy retryPolicy 253 path string 254 } 255 256 func newVersionsIterator( 257 ctx context.Context, 258 impl *s3Impl, 259 q s3Query, 260 in s3.ListObjectVersionsInput, 261 ) (*versionsIterator, error) { 262 clients, err := impl.clientsForAction(ctx, "ListVersions", q.bucket, q.key) 263 if err != nil { 264 return nil, errors.E(err, "getting clients") 265 } 266 policy := newBackoffPolicy(clients, file.Opts{}) 267 return &versionsIterator{in: in, policy: policy, path: q.path()}, nil 268 } 269 270 func (it *versionsIterator) HasNextPage() bool { return !it.eof } 271 272 func (it *versionsIterator) NextPage(ctx context.Context) (*s3.ListObjectVersionsOutput, error) { 273 for { 274 var ids s3RequestIDs 275 out, err := it.policy.client().ListObjectVersionsWithContext(ctx, &it.in, ids.captureOption()) 276 if err == nil { 277 it.in.KeyMarker = out.NextKeyMarker 278 it.in.VersionIdMarker = out.NextVersionIdMarker 279 if !*out.IsTruncated { 280 it.eof = true 281 } 282 return out, nil 283 } 284 if !it.policy.shouldRetry(ctx, err, it.path) { 285 it.eof = true 286 return nil, annotate(err, ids, &it.policy, "s3file.versionsRootNode.Child", it.path) 287 } 288 } 289 } 290 291 func sanitizePathElem(s string) string { 292 // TODO: Consider being stricter. S3 guarantees very little about version IDs: 293 // https://docs.aws.amazon.com/AmazonS3/latest/userguide/versioning-workflows.html#version-ids 294 // TODO: Implement more robust replacement (with some escape char, etc.) so that we cannot 295 // introduce collisions. 296 return strings.ReplaceAll(s, "/", "_") 297 }