github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/aws_table_persister.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
    "bytes"
    "context"
    "errors"
    "fmt"
    "io"
    "net/url"
    "sort"
    "sync"
    "time"

    "github.com/aws/aws-sdk-go/aws"
    "github.com/aws/aws-sdk-go/service/s3"
    "github.com/aws/aws-sdk-go/service/s3/s3iface"
    "github.com/aws/aws-sdk-go/service/s3/s3manager"

    "github.com/dolthub/dolt/go/store/atomicerr"
    "github.com/dolthub/dolt/go/store/chunks"
    "github.com/dolthub/dolt/go/store/hash"
    "github.com/dolthub/dolt/go/store/util/verbose"
)

const (
    minS3PartSize = 5 * 1 << 20  // 5MiB
    maxS3PartSize = 64 * 1 << 20 // 64MiB
    maxS3Parts    = 10000

    defaultS3PartSize = minS3PartSize // the smallest part size allowed by S3 allows for the most throughput
)
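// Illustrative aside, not part of the upstream file: S3 caps a multipart upload at 10,000 parts,
// so a fixed part size puts a ceiling on how large a single uploaded table file can be:
//
//     defaultS3PartSize * maxS3Parts =  5MiB * 10000 ≈ 48.8GiB
//     maxS3PartSize     * maxS3Parts = 64MiB * 10000 =  625GiB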
type awsTablePersister struct {
    s3     s3iface.S3API
    bucket string
    rl     chan struct{}
    limits awsLimits
    ns     string
    q      MemoryQuotaProvider
}

var _ tablePersister = awsTablePersister{}
var _ tableFilePersister = awsTablePersister{}

type awsLimits struct {
    partTarget, partMin, partMax uint64
}

func (s3p awsTablePersister) Open(ctx context.Context, name hash.Hash, chunkCount uint32, stats *Stats) (chunkSource, error) {
    return newAWSChunkSource(
        ctx,
        &s3ObjectReader{s3: s3p.s3, bucket: s3p.bucket, readRl: s3p.rl, ns: s3p.ns},
        s3p.limits,
        name,
        chunkCount,
        s3p.q,
        stats,
    )
}

func (s3p awsTablePersister) Exists(ctx context.Context, name hash.Hash, chunkCount uint32, stats *Stats) (bool, error) {
    return tableExistsInChunkSource(
        ctx,
        &s3ObjectReader{s3: s3p.s3, bucket: s3p.bucket, readRl: s3p.rl, ns: s3p.ns},
        s3p.limits,
        name,
        chunkCount,
        s3p.q,
        stats,
    )
}

func (s3p awsTablePersister) CopyTableFile(ctx context.Context, r io.Reader, fileId string, fileSz uint64, chunkCount uint32) error {
    return s3p.multipartUpload(ctx, r, fileSz, fileId)
}

func (s3p awsTablePersister) Path() string {
    return s3p.bucket
}

func (s3p awsTablePersister) AccessMode() chunks.ExclusiveAccessMode {
    return chunks.ExclusiveAccessMode_Shared
}

type s3UploadedPart struct {
    idx  int64
    etag string
}

func (s3p awsTablePersister) key(k string) string {
    if s3p.ns != "" {
        return s3p.ns + "/" + k
    }
    return k
}

func (s3p awsTablePersister) Persist(ctx context.Context, mt *memTable, haver chunkReader, stats *Stats) (chunkSource, error) {
    name, data, chunkCount, err := mt.write(haver, stats)

    if err != nil {
        return emptyChunkSource{}, err
    }

    if chunkCount == 0 {
        return emptyChunkSource{}, nil
    }

    err = s3p.multipartUpload(ctx, bytes.NewReader(data), uint64(len(data)), name.String())

    if err != nil {
        return emptyChunkSource{}, err
    }

    tra := &s3TableReaderAt{&s3ObjectReader{s3: s3p.s3, bucket: s3p.bucket, readRl: s3p.rl, ns: s3p.ns}, name}
    return newReaderFromIndexData(ctx, s3p.q, data, name, tra, s3BlockSize)
}

func (s3p awsTablePersister) multipartUpload(ctx context.Context, r io.Reader, sz uint64, key string) error {
    uploader := s3manager.NewUploaderWithClient(s3p.s3, func(u *s3manager.Uploader) {
        u.PartSize = int64(s3p.limits.partTarget)
    })
    _, err := uploader.Upload(&s3manager.UploadInput{
        Bucket: aws.String(s3p.bucket),
        Key:    aws.String(s3p.key(key)),
        Body:   r,
    })
    return err
}

func (s3p awsTablePersister) startMultipartUpload(ctx context.Context, key string) (string, error) {
    result, err := s3p.s3.CreateMultipartUploadWithContext(ctx, &s3.CreateMultipartUploadInput{
        Bucket: aws.String(s3p.bucket),
        Key:    aws.String(s3p.key(key)),
    })

    if err != nil {
        return "", err
    }

    return *result.UploadId, nil
}

func (s3p awsTablePersister) abortMultipartUpload(ctx context.Context, key, uploadID string) error {
    _, abrtErr := s3p.s3.AbortMultipartUploadWithContext(ctx, &s3.AbortMultipartUploadInput{
        Bucket:   aws.String(s3p.bucket),
        Key:      aws.String(s3p.key(key)),
        UploadId: aws.String(uploadID),
    })

    return abrtErr
}

func (s3p awsTablePersister) completeMultipartUpload(ctx context.Context, key, uploadID string, mpu *s3.CompletedMultipartUpload) error {
    _, err := s3p.s3.CompleteMultipartUploadWithContext(ctx, &s3.CompleteMultipartUploadInput{
        Bucket:          aws.String(s3p.bucket),
        Key:             aws.String(s3p.key(key)),
        MultipartUpload: mpu,
        UploadId:        aws.String(uploadID),
    })

    return err
}

func getNumParts(dataLen, minPartSize uint64) uint64 {
    numParts := dataLen / minPartSize
    if numParts == 0 {
        numParts = 1
    }
    return numParts
}

type partsByPartNum []*s3.CompletedPart

func (s partsByPartNum) Len() int {
    return len(s)
}

func (s partsByPartNum) Less(i, j int) bool {
    return *s[i].PartNumber < *s[j].PartNumber
}

func (s partsByPartNum) Swap(i, j int) {
    s[i], s[j] = s[j], s[i]
}

func (s3p awsTablePersister) ConjoinAll(ctx context.Context, sources chunkSources, stats *Stats) (chunkSource, cleanupFunc, error) {
    plan, err := planRangeCopyConjoin(sources, stats)
    if err != nil {
        return nil, nil, err
    }

    if plan.chunkCount == 0 {
        return emptyChunkSource{}, nil, nil
    }
    t1 := time.Now()
    name := nameFromSuffixes(plan.suffixes())
    err = s3p.executeCompactionPlan(ctx, plan, name.String())

    if err != nil {
        return nil, nil, err
    }

    verbose.Logger(ctx).Sugar().Debugf("Compacted table of %d Kb in %s", plan.totalCompressedData/1024, time.Since(t1))

    tra := &s3TableReaderAt{&s3ObjectReader{s3: s3p.s3, bucket: s3p.bucket, readRl: s3p.rl, ns: s3p.ns}, name}
    cs, err := newReaderFromIndexData(ctx, s3p.q, plan.mergedIndex, name, tra, s3BlockSize)
    return cs, func() {}, err
}
func (s3p awsTablePersister) executeCompactionPlan(ctx context.Context, plan compactionPlan, key string) error {
    uploadID, err := s3p.startMultipartUpload(ctx, key)

    if err != nil {
        return err
    }

    multipartUpload, err := s3p.assembleTable(ctx, plan, key, uploadID)
    if err != nil {
        _ = s3p.abortMultipartUpload(ctx, key, uploadID)
        return err
    }

    return s3p.completeMultipartUpload(ctx, key, uploadID, multipartUpload)
}

func (s3p awsTablePersister) assembleTable(ctx context.Context, plan compactionPlan, key, uploadID string) (*s3.CompletedMultipartUpload, error) {
    if len(plan.sources.sws) > maxS3Parts {
        return nil, errors.New("exceeded maximum parts")
    }

    // Separate plan.sources by the amount of chunk data they hold. Tables with at least 5MiB of
    // chunk data (copies) can be added to the new table using S3's multipart upload copy feature.
    // Smaller tables with less than 5MiB of chunk data (manuals) must be read, assembled into
    // |buff|, and then re-uploaded in parts that are at least 5MiB.
    copies, manuals, buff, err := dividePlan(ctx, plan, uint64(s3p.limits.partMin), uint64(s3p.limits.partMax))

    if err != nil {
        return nil, err
    }

    ae := atomicerr.New()
    // Concurrently read data from small tables into |buff|
    var readWg sync.WaitGroup
    for _, man := range manuals {
        readWg.Add(1)
        go func(m manualPart) {
            defer readWg.Done()
            err := m.run(ctx, buff)
            if err != nil {
                ae.SetIfError(fmt.Errorf("failed to read conjoin table data: %w", err))
            }
        }(man)
    }
    readWg.Wait()

    if err := ae.Get(); err != nil {
        return nil, err
    }

    // sendPart calls |doUpload| to send part |partNum|, forwarding errors over |failed| or success
    // over |sent|. Closing (or sending) on |done| will cancel all in-progress calls to sendPart.
    sent, failed, done := make(chan s3UploadedPart), make(chan error), make(chan struct{})
    var uploadWg sync.WaitGroup
    type uploadFn func() (etag string, err error)
    sendPart := func(partNum int64, doUpload uploadFn) {
        if s3p.rl != nil {
            s3p.rl <- struct{}{}
            defer func() { <-s3p.rl }()
        }
        defer uploadWg.Done()

        // Check if upload has been terminated
        select {
        case <-done:
            return
        default:
        }

        etag, err := doUpload()
        if err != nil {
            failed <- err
            return
        }
        // Try to send along part info. In the case that the upload was aborted, reading from done
        // allows this worker to exit correctly.
        select {
        case sent <- s3UploadedPart{int64(partNum), etag}:
        case <-done:
            return
        }
    }

    // Concurrently begin sending all parts using sendPart().
    // First, kick off sending all the copyable parts.
    partNum := int64(1) // Part numbers are 1-indexed
    for _, cp := range copies {
        uploadWg.Add(1)
        go func(cp copyPart, partNum int64) {
            sendPart(partNum, func() (etag string, err error) {
                return s3p.uploadPartCopy(ctx, cp.name, cp.srcOffset, cp.srcLen, key, uploadID, partNum)
            })
        }(cp, partNum)
        partNum++
    }

    // Then, split buff (data from |manuals| and index) into parts and upload those concurrently.
    numManualParts := getNumParts(uint64(len(buff)), s3p.limits.partTarget) // TODO: What if this is too big?
    for i := uint64(0); i < numManualParts; i++ {
        start, end := i*s3p.limits.partTarget, (i+1)*s3p.limits.partTarget
        if i+1 == numManualParts { // If this is the last part, make sure it includes any overflow
            end = uint64(len(buff))
        }
        uploadWg.Add(1)
        go func(data []byte, partNum int64) {
            sendPart(partNum, func() (etag string, err error) {
                return s3p.uploadPart(ctx, data, key, uploadID, partNum)
            })
        }(buff[start:end], partNum)
        partNum++
    }

    // When all the uploads started above are done, close |sent| and |failed| so that the code
    // below will correctly detect that we're done sending parts and move forward.
    go func() {
        uploadWg.Wait()
        close(sent)
        close(failed)
    }()

    // Watch |sent| and |failed| for the results of part uploads. If ever one fails, close |done|
    // to stop all the in-progress or pending sendPart() calls and then bail.
    multipartUpload := &s3.CompletedMultipartUpload{}
    var firstFailure error
    for cont := true; cont; {
        select {
        case sentPart, open := <-sent:
            if open {
                multipartUpload.Parts = append(multipartUpload.Parts, &s3.CompletedPart{
                    ETag:       aws.String(sentPart.etag),
                    PartNumber: aws.Int64(sentPart.idx),
                })
            }
            cont = open

        case err := <-failed:
            if err != nil && firstFailure == nil { // nil err may happen when failed gets closed
                firstFailure = err
                close(done)
            }
        }
    }

    // If there was any failure detected above, |done| is already closed
    if firstFailure == nil {
        close(done)
    }
    sort.Sort(partsByPartNum(multipartUpload.Parts)) // S3 requires that these be in part-order
    return multipartUpload, firstFailure
}
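// Illustrative sketch, not part of the upstream file: how assembleTable numbers parts. Given
// (hypothetically) 3 copyable sources and a 12MiB |buff| with a 5MiB partTarget, the copies take
// part numbers 1-3, getNumParts(12MiB, 5MiB) == 2, and |buff| is uploaded as parts 4 and 5, where
// part 5 spans bytes [5MiB, 12MiB) and so absorbs the 2MiB that doesn't fill a whole extra part.
func examplePartNumbering() {
    const partTarget = uint64(5 * 1 << 20) // hypothetical partTarget, matching defaultS3PartSize
    buffLen := uint64(12 * 1 << 20)
    numManualParts := getNumParts(buffLen, partTarget) // == 2
    for i := uint64(0); i < numManualParts; i++ {
        start, end := i*partTarget, (i+1)*partTarget
        if i+1 == numManualParts {
            end = buffLen // the final part takes the overflow, here bytes [5MiB, 12MiB)
        }
        _, _ = start, end
    }
}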
type copyPart struct {
    name              string
    srcOffset, srcLen int64
}

type manualPart struct {
    src        chunkSource
    start, end int64
}

func (mp manualPart) run(ctx context.Context, buff []byte) error {
    reader, _, err := mp.src.reader(ctx)
    if err != nil {
        return err
    }
    defer reader.Close()
    _, err = io.ReadFull(reader, buff[mp.start:mp.end])
    return err
}

// dividePlan assumes that plan.sources (which is of type chunkSourcesByDescendingDataSize) is
// correctly sorted by descending data size.
func dividePlan(ctx context.Context, plan compactionPlan, minPartSize, maxPartSize uint64) (copies []copyPart, manuals []manualPart, buff []byte, err error) {
    // NB: if maxPartSize < 2*minPartSize, splitting large copies apart isn't solvable. S3's limits
    // are plenty far enough apart that this isn't a problem in production, but we could violate
    // this in tests.
    if maxPartSize < 2*minPartSize {
        return nil, nil, nil, errors.New("failed to split large copies apart")
    }

    buffSize := uint64(len(plan.mergedIndex))
    i := 0
    for ; i < len(plan.sources.sws); i++ {
        sws := plan.sources.sws[i]
        if sws.dataLen < minPartSize {
            // Since plan.sources is sorted in descending chunk-data-length order, we know that sws
            // and all members after it are too small to copy.
            break
        }
        if sws.dataLen <= maxPartSize {
            h := sws.source.hash()
            copies = append(copies, copyPart{h.String(), 0, int64(sws.dataLen)})
            continue
        }

        // Now, we need to break the data into some number of parts such that for all parts
        // minPartSize <= size(part) <= maxPartSize. This code tries to split the part evenly, such
        // that all new parts satisfy the previous inequality. This gets tricky around edge cases.
        // Consider min = 5b and max = 10b and a data length of 101b. You need to send 11 parts,
        // but you can't just send 10 parts of 10 bytes and 1 part of 1 byte -- the last is too
        // small. You also can't send 10 parts of 9 bytes each and 1 part of 11 bytes, because the
        // last is too big. You have to distribute the extra bytes across all the parts so that all
        // of them fall into the proper size range.
        lens := splitOnMaxSize(sws.dataLen, maxPartSize)

        var srcStart int64
        for _, length := range lens {
            h := sws.source.hash()
            copies = append(copies, copyPart{h.String(), srcStart, length})
            srcStart += length
        }
    }
    var offset int64
    for ; i < len(plan.sources.sws); i++ {
        sws := plan.sources.sws[i]
        manuals = append(manuals, manualPart{sws.source, offset, offset + int64(sws.dataLen)})
        offset += int64(sws.dataLen)
        buffSize += sws.dataLen
    }
    buff = make([]byte, buffSize)
    copy(buff[buffSize-uint64(len(plan.mergedIndex)):], plan.mergedIndex)
    return
}
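// Illustrative sketch, not part of the upstream file: a worked run of the 101-byte example from
// the comment in dividePlan above, using splitOnMaxSize (defined just below). With maxPartSize =
// 10, ceil(101/10) = 11 parts, baseSize = 101/11 = 9 and extraBytes = 101%11 = 2, so the first two
// parts get one extra byte each: two 10-byte parts followed by nine 9-byte parts, all within the
// hypothetical [5, 10] size range.
func exampleSplitOnMaxSize() {
    sizes := splitOnMaxSize(101, 10) // [10 10 9 9 9 9 9 9 9 9 9]
    total := int64(0)
    for _, sz := range sizes {
        total += sz
    }
    // total == 101 and len(sizes) == 11
    _ = total
}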
// splitOnMaxSize splits |dataLen| into the smallest number of roughly-equal parts such that each
// part is <= maxPartSize.
func splitOnMaxSize(dataLen, maxPartSize uint64) []int64 {
    numParts := dataLen / maxPartSize
    if dataLen%maxPartSize > 0 {
        numParts++
    }
    baseSize := int64(dataLen / numParts)
    extraBytes := dataLen % numParts
    sizes := make([]int64, numParts)
    for i := range sizes {
        sizes[i] = baseSize
        if extraBytes > 0 {
            sizes[i]++
            extraBytes--
        }
    }
    return sizes
}

func (s3p awsTablePersister) uploadPartCopy(ctx context.Context, src string, srcStart, srcEnd int64, key, uploadID string, partNum int64) (etag string, err error) {
    res, err := s3p.s3.UploadPartCopyWithContext(ctx, &s3.UploadPartCopyInput{
        CopySource:      aws.String(url.PathEscape(s3p.bucket + "/" + s3p.key(src))),
        CopySourceRange: aws.String(s3RangeHeader(srcStart, srcEnd)),
        Bucket:          aws.String(s3p.bucket),
        Key:             aws.String(s3p.key(key)),
        PartNumber:      aws.Int64(int64(partNum)),
        UploadId:        aws.String(uploadID),
    })
    if err == nil {
        etag = *res.CopyPartResult.ETag
    }
    return
}

func (s3p awsTablePersister) uploadPart(ctx context.Context, data []byte, key, uploadID string, partNum int64) (etag string, err error) {
    res, err := s3p.s3.UploadPartWithContext(ctx, &s3.UploadPartInput{
        Bucket:     aws.String(s3p.bucket),
        Key:        aws.String(s3p.key(key)),
        PartNumber: aws.Int64(int64(partNum)),
        UploadId:   aws.String(uploadID),
        Body:       bytes.NewReader(data),
    })
    if err == nil {
        etag = *res.ETag
    }
    return
}

func (s3p awsTablePersister) PruneTableFiles(ctx context.Context, keeper func() []hash.Hash, t time.Time) error {
    return chunks.ErrUnsupportedOperation
}

func (s3p awsTablePersister) Close() error {
    return nil
}