github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/nbs/aws_table_persister.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
//     Copyright 2016 Attic Labs, Inc. All rights reserved.
//     Licensed under the Apache License, version 2.0:
//     http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
    "bytes"
    "context"
    "errors"
    "io"
    "net/url"
    "sort"
    "sync"
    "time"

    "github.com/aws/aws-sdk-go/aws"
    "github.com/aws/aws-sdk-go/service/s3"

    "github.com/dolthub/dolt/go/store/atomicerr"
    "github.com/dolthub/dolt/go/store/chunks"
    "github.com/dolthub/dolt/go/store/util/verbose"
)

const (
    minS3PartSize = 5 * 1 << 20  // 5MiB
    maxS3PartSize = 64 * 1 << 20 // 64MiB
    maxS3Parts    = 10000

    // Disable persisting tables in DynamoDB. This is currently unused by
    // Dolthub, and keeping it requires provisioning DynamoDB throughput for
    // the noop reads.
    maxDynamoChunks   = 0
    maxDynamoItemSize = 0

    defaultS3PartSize = minS3PartSize // smallest allowed by S3; allows for most throughput
)

type awsTablePersister struct {
    s3         s3svc
    bucket     string
    rl         chan struct{}
    tc         tableCache
    ddb        *ddbTableStore
    limits     awsLimits
    indexCache *indexCache
    ns         string
    parseIndex indexParserF
}

type awsLimits struct {
    partTarget, partMin, partMax uint64
    itemMax                      int
    chunkMax                     uint32
}

func (al awsLimits) tableFitsInDynamo(name addr, dataLen int, chunkCount uint32) bool {
    calcItemSize := func(n addr, dataLen int) int {
        return len(dbAttr) + len(tablePrefix) + len(n.String()) + len(dataAttr) + dataLen
    }
    return chunkCount <= al.chunkMax && calcItemSize(name, dataLen) < al.itemMax
}

func (al awsLimits) tableMayBeInDynamo(chunkCount uint32) bool {
    return chunkCount <= al.chunkMax
}
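
// awsTablePersister writes table files to S3 via multipart upload; a table small
// enough to fit in DynamoDB is written there instead, though that path is disabled
// above by maxDynamoChunks/maxDynamoItemSize = 0. A minimal construction sketch
// (illustrative only; mySvc, myDDBStore, and the field values below are hypothetical):
//
//    p := awsTablePersister{
//        s3:     mySvc,      // an s3svc implementation
//        bucket: "my-bucket",
//        ddb:    myDDBStore, // a *ddbTableStore
//        limits: awsLimits{
//            partTarget: defaultS3PartSize,
//            partMin:    minS3PartSize,
//            partMax:    maxS3PartSize,
//            itemMax:    maxDynamoItemSize,
//            chunkMax:   maxDynamoChunks,
//        },
//        ns: "db",
//    }
//    src, err := p.Persist(ctx, mt, haver, stats) // mt, haver, stats per Persist's signature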

func (s3p awsTablePersister) Open(ctx context.Context, name addr, chunkCount uint32, stats *Stats) (chunkSource, error) {
    return newAWSChunkSource(
        ctx,
        s3p.ddb,
        &s3ObjectReader{s3: s3p.s3, bucket: s3p.bucket, readRl: s3p.rl, tc: s3p.tc, ns: s3p.ns},
        s3p.limits,
        name,
        chunkCount,
        s3p.indexCache,
        stats,
        s3p.parseIndex,
    )
}

type s3UploadedPart struct {
    idx  int64
    etag string
}

func (s3p awsTablePersister) key(k string) string {
    if s3p.ns != "" {
        return s3p.ns + "/" + k
    }
    return k
}

func (s3p awsTablePersister) Persist(ctx context.Context, mt *memTable, haver chunkReader, stats *Stats) (chunkSource, error) {
    name, data, chunkCount, err := mt.write(haver, stats)

    if err != nil {
        return emptyChunkSource{}, err
    }

    if chunkCount == 0 {
        return emptyChunkSource{}, nil
    }

    if s3p.limits.tableFitsInDynamo(name, len(data), chunkCount) {
        err := s3p.ddb.Write(ctx, name, data)

        if err != nil {
            return nil, err
        }

        return newReaderFromIndexData(s3p.indexCache, data, name, &dynamoTableReaderAt{ddb: s3p.ddb, h: name}, s3BlockSize)
    }

    if s3p.tc != nil {
        go func() {
            // Ignore errors. Will be reloaded on read if needed, or error will occur at that time.
            _ = s3p.tc.store(name, bytes.NewReader(data), uint64(len(data)))
        }()
    }

    err = s3p.multipartUpload(ctx, data, name.String())

    if err != nil {
        return emptyChunkSource{}, err
    }

    tra := &s3TableReaderAt{&s3ObjectReader{s3: s3p.s3, bucket: s3p.bucket, readRl: s3p.rl, tc: s3p.tc, ns: s3p.ns}, name}
    return newReaderFromIndexData(s3p.indexCache, data, name, tra, s3BlockSize)
}

func (s3p awsTablePersister) multipartUpload(ctx context.Context, data []byte, key string) error {
    uploadID, err := s3p.startMultipartUpload(ctx, key)

    if err != nil {
        return err
    }

    multipartUpload, err := s3p.uploadParts(ctx, data, key, uploadID)
    if err != nil {
        _ = s3p.abortMultipartUpload(ctx, key, uploadID)
        return err
    }

    return s3p.completeMultipartUpload(ctx, key, uploadID, multipartUpload)
}

func (s3p awsTablePersister) startMultipartUpload(ctx context.Context, key string) (string, error) {
    result, err := s3p.s3.CreateMultipartUploadWithContext(ctx, &s3.CreateMultipartUploadInput{
        Bucket: aws.String(s3p.bucket),
        Key:    aws.String(s3p.key(key)),
    })

    if err != nil {
        return "", err
    }

    return *result.UploadId, nil
}

func (s3p awsTablePersister) abortMultipartUpload(ctx context.Context, key, uploadID string) error {
    _, abrtErr := s3p.s3.AbortMultipartUploadWithContext(ctx, &s3.AbortMultipartUploadInput{
        Bucket:   aws.String(s3p.bucket),
        Key:      aws.String(s3p.key(key)),
        UploadId: aws.String(uploadID),
    })

    return abrtErr
}

func (s3p awsTablePersister) completeMultipartUpload(ctx context.Context, key, uploadID string, mpu *s3.CompletedMultipartUpload) error {
    _, err := s3p.s3.CompleteMultipartUploadWithContext(ctx, &s3.CompleteMultipartUploadInput{
        Bucket:          aws.String(s3p.bucket),
        Key:             aws.String(s3p.key(key)),
        MultipartUpload: mpu,
        UploadId:        aws.String(uploadID),
    })

    return err
}
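
// uploadParts concurrently uploads |data| to S3 in parts of roughly
// |s3p.limits.partTarget| bytes each (the final part absorbs any remainder) and
// returns the completed part list needed to finish the multipart upload, or the
// first error encountered.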
func (s3p awsTablePersister) uploadParts(ctx context.Context, data []byte, key, uploadID string) (*s3.CompletedMultipartUpload, error) {
    sent, failed, done := make(chan s3UploadedPart), make(chan error), make(chan struct{})

    numParts := getNumParts(uint64(len(data)), s3p.limits.partTarget)

    if numParts > maxS3Parts {
        return nil, errors.New("exceeded maximum parts")
    }

    var wg sync.WaitGroup
    sendPart := func(partNum, start, end uint64) {
        if s3p.rl != nil {
            s3p.rl <- struct{}{}
            defer func() { <-s3p.rl }()
        }
        defer wg.Done()

        // Check if upload has been terminated
        select {
        case <-done:
            return
        default:
        }
        // Upload the desired part
        if partNum == numParts { // If this is the last part, make sure it includes any overflow
            end = uint64(len(data))
        }
        etag, err := s3p.uploadPart(ctx, data[start:end], key, uploadID, int64(partNum))
        if err != nil {
            failed <- err
            return
        }
        // Try to send along part info. In the case that the upload was aborted,
        // reading from done allows this worker to exit correctly.
        select {
        case sent <- s3UploadedPart{int64(partNum), etag}:
        case <-done:
            return
        }
    }
    for i := uint64(0); i < numParts; i++ {
        wg.Add(1)
        partNum := i + 1 // Parts are 1-indexed
        start, end := i*s3p.limits.partTarget, (i+1)*s3p.limits.partTarget
        go sendPart(partNum, start, end)
    }
    go func() {
        wg.Wait()
        close(sent)
        close(failed)
    }()

    multipartUpload := &s3.CompletedMultipartUpload{}
    var firstFailure error
    for cont := true; cont; {
        select {
        case sentPart, open := <-sent:
            if open {
                multipartUpload.Parts = append(multipartUpload.Parts, &s3.CompletedPart{
                    ETag:       aws.String(sentPart.etag),
                    PartNumber: aws.Int64(sentPart.idx),
                })
            }
            cont = open

        case err := <-failed:
            if err != nil && firstFailure == nil { // nil err may happen when failed gets closed
                firstFailure = err
                close(done)
            }
        }
    }

    if firstFailure == nil {
        close(done)
    }
    sort.Sort(partsByPartNum(multipartUpload.Parts))
    return multipartUpload, firstFailure
}
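
// getNumParts returns the number of |minPartSize|-byte parts that |dataLen| splits
// into, flooring the division and never returning less than 1; callers extend the
// final part to absorb any remainder. For example, getNumParts(12*1<<20, 5*1<<20)
// is 2: the first part covers 5MiB and the final part grows to cover the remaining 7MiB.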
func getNumParts(dataLen, minPartSize uint64) uint64 {
    numParts := dataLen / minPartSize
    if numParts == 0 {
        numParts = 1
    }
    return numParts
}

type partsByPartNum []*s3.CompletedPart

func (s partsByPartNum) Len() int {
    return len(s)
}

func (s partsByPartNum) Less(i, j int) bool {
    return *s[i].PartNumber < *s[j].PartNumber
}

func (s partsByPartNum) Swap(i, j int) {
    s[i], s[j] = s[j], s[i]
}

func (s3p awsTablePersister) ConjoinAll(ctx context.Context, sources chunkSources, stats *Stats) (chunkSource, error) {
    plan, err := planConjoin(sources, stats)

    if err != nil {
        return nil, err
    }

    if plan.chunkCount == 0 {
        return emptyChunkSource{}, nil
    }
    t1 := time.Now()
    name := nameFromSuffixes(plan.suffixes())
    err = s3p.executeCompactionPlan(ctx, plan, name.String())

    if err != nil {
        return nil, err
    }

    verbose.Logger(ctx).Sugar().Debugf("Compacted table of %d Kb in %s", plan.totalCompressedData/1024, time.Since(t1))

    if s3p.tc != nil {
        go func() {
            // load conjoined table to the cache. Ignore errors. Will be reloaded on read
            // if needed, or error will occur at that time.
            _ = s3p.loadIntoCache(ctx, name)
        }()
    }

    tra := &s3TableReaderAt{&s3ObjectReader{s3: s3p.s3, bucket: s3p.bucket, readRl: s3p.rl, tc: s3p.tc, ns: s3p.ns}, name}
    return newReaderFromIndexData(s3p.indexCache, plan.mergedIndex, name, tra, s3BlockSize)
}

func (s3p awsTablePersister) loadIntoCache(ctx context.Context, name addr) error {
    input := &s3.GetObjectInput{
        Bucket: aws.String(s3p.bucket),
        Key:    aws.String(name.String()),
    }
    result, err := s3p.s3.GetObjectWithContext(ctx, input)

    if err != nil {
        return err
    }
    defer result.Body.Close()

    return s3p.tc.store(name, result.Body, uint64(*result.ContentLength))
}

func (s3p awsTablePersister) executeCompactionPlan(ctx context.Context, plan compactionPlan, key string) error {
    uploadID, err := s3p.startMultipartUpload(ctx, key)

    if err != nil {
        return err
    }

    multipartUpload, err := s3p.assembleTable(ctx, plan, key, uploadID)
    if err != nil {
        _ = s3p.abortMultipartUpload(ctx, key, uploadID)
        return err
    }

    return s3p.completeMultipartUpload(ctx, key, uploadID, multipartUpload)
}
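
// assembleTable uploads the conjoined table described by |plan| as the parts of a
// single S3 multipart upload: sources with enough chunk data are copied server-side
// via UploadPartCopy, while smaller sources and the merged index are read into a
// local buffer and uploaded as ordinary parts.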
func (s3p awsTablePersister) assembleTable(ctx context.Context, plan compactionPlan, key, uploadID string) (*s3.CompletedMultipartUpload, error) {
    if len(plan.sources.sws) > maxS3Parts {
        return nil, errors.New("exceeded maximum parts")
    }

    // Separate plan.sources by amount of chunkData. Tables with >5MB of chunk data
    // (copies) can be added to the new table using S3's multipart upload copy feature.
    // Smaller tables with <5MB of chunk data (manuals) must be read, assembled into
    // |buff|, and then re-uploaded in parts that are larger than 5MB.
    copies, manuals, buff, err := dividePlan(ctx, plan, uint64(s3p.limits.partMin), uint64(s3p.limits.partMax))

    if err != nil {
        return nil, err
    }

    ae := atomicerr.New()
    // Concurrently read data from small tables into |buff|
    var readWg sync.WaitGroup
    for _, man := range manuals {
        readWg.Add(1)
        go func(m manualPart) {
            defer readWg.Done()
            n, _ := m.srcR.Read(buff[m.dstStart:m.dstEnd])
            if int64(n) < m.dstEnd-m.dstStart {
                ae.SetIfError(errors.New("failed to read all the table data"))
            }
        }(man)
    }
    readWg.Wait()

    if err := ae.Get(); err != nil {
        return nil, err
    }

    // sendPart calls |doUpload| to send part |partNum|, forwarding errors over |failed|
    // or success over |sent|. Closing (or sending) on |done| will cancel all in-progress
    // calls to sendPart.
    sent, failed, done := make(chan s3UploadedPart), make(chan error), make(chan struct{})
    var uploadWg sync.WaitGroup
    type uploadFn func() (etag string, err error)
    sendPart := func(partNum int64, doUpload uploadFn) {
        if s3p.rl != nil {
            s3p.rl <- struct{}{}
            defer func() { <-s3p.rl }()
        }
        defer uploadWg.Done()

        // Check if upload has been terminated
        select {
        case <-done:
            return
        default:
        }

        etag, err := doUpload()
        if err != nil {
            failed <- err
            return
        }
        // Try to send along part info. In the case that the upload was aborted,
        // reading from done allows this worker to exit correctly.
        select {
        case sent <- s3UploadedPart{int64(partNum), etag}:
        case <-done:
            return
        }
    }

    // Concurrently begin sending all parts using sendPart().
    // First, kick off sending all the copyable parts.
    partNum := int64(1) // Part numbers are 1-indexed
    for _, cp := range copies {
        uploadWg.Add(1)
        go func(cp copyPart, partNum int64) {
            sendPart(partNum, func() (etag string, err error) {
                return s3p.uploadPartCopy(ctx, cp.name, cp.srcOffset, cp.srcLen, key, uploadID, partNum)
            })
        }(cp, partNum)
        partNum++
    }

    // Then, split buff (data from |manuals| and index) into parts and upload those concurrently.
    numManualParts := getNumParts(uint64(len(buff)), s3p.limits.partTarget) // TODO: What if this is too big?
    for i := uint64(0); i < numManualParts; i++ {
        start, end := i*s3p.limits.partTarget, (i+1)*s3p.limits.partTarget
        if i+1 == numManualParts { // If this is the last part, make sure it includes any overflow
            end = uint64(len(buff))
        }
        uploadWg.Add(1)
        go func(data []byte, partNum int64) {
            sendPart(partNum, func() (etag string, err error) {
                return s3p.uploadPart(ctx, data, key, uploadID, partNum)
            })
        }(buff[start:end], partNum)
        partNum++
    }

    // When all the uploads started above are done, close |sent| and |failed| so that
    // the code below will correctly detect that we're done sending parts and move forward.
    go func() {
        uploadWg.Wait()
        close(sent)
        close(failed)
    }()

    // Watch |sent| and |failed| for the results of part uploads. If ever one fails,
    // close |done| to stop all the in-progress or pending sendPart() calls and then bail.
    multipartUpload := &s3.CompletedMultipartUpload{}
    var firstFailure error
    for cont := true; cont; {
        select {
        case sentPart, open := <-sent:
            if open {
                multipartUpload.Parts = append(multipartUpload.Parts, &s3.CompletedPart{
                    ETag:       aws.String(sentPart.etag),
                    PartNumber: aws.Int64(sentPart.idx),
                })
            }
            cont = open

        case err := <-failed:
            if err != nil && firstFailure == nil { // nil err may happen when failed gets closed
                firstFailure = err
                close(done)
            }
        }
    }

    // If there was any failure detected above, |done| is already closed
    if firstFailure == nil {
        close(done)
    }
    sort.Sort(partsByPartNum(multipartUpload.Parts)) // S3 requires that these be in part-order
    return multipartUpload, firstFailure
}

type copyPart struct {
    name              string
    srcOffset, srcLen int64
}

type manualPart struct {
    srcR             io.Reader
    dstStart, dstEnd int64
}
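
// dividePlan partitions plan.sources into |copies| (tables with at least |minPartSize|
// bytes of chunk data, which can be copied server-side, split into ranges of at most
// |maxPartSize| bytes) and |manuals| (smaller tables whose data must be read locally).
// |buff| is sized to hold the manual tables' data followed by plan.mergedIndex, which
// dividePlan copies into its tail.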
// dividePlan assumes that plan.sources (which is of type chunkSourcesByDescendingDataSize)
// is correctly sorted by descending data size.
func dividePlan(ctx context.Context, plan compactionPlan, minPartSize, maxPartSize uint64) (copies []copyPart, manuals []manualPart, buff []byte, err error) {
    // NB: if maxPartSize < 2*minPartSize, splitting large copies apart isn't solvable.
    // S3's limits are plenty far enough apart that this isn't a problem in production,
    // but we could violate this in tests.
    if maxPartSize < 2*minPartSize {
        return nil, nil, nil, errors.New("failed to split large copies apart")
    }

    buffSize := uint64(len(plan.mergedIndex))
    i := 0
    for ; i < len(plan.sources.sws); i++ {
        sws := plan.sources.sws[i]
        if sws.dataLen < minPartSize {
            // since plan.sources is sorted in descending chunk-data-length order, we know
            // that sws and all members after it are too small to copy.
            break
        }
        if sws.dataLen <= maxPartSize {
            h, err := sws.source.hash()

            if err != nil {
                return nil, nil, nil, err
            }

            copies = append(copies, copyPart{h.String(), 0, int64(sws.dataLen)})
            continue
        }

        // Now, we need to break the data into some number of parts such that for all
        // parts minPartSize <= size(part) <= maxPartSize. This code tries to split the
        // part evenly, such that all new parts satisfy the previous inequality. This
        // gets tricky around edge cases. Consider min = 5b and max = 10b and a data
        // length of 101b. You need to send 11 parts, but you can't just send 10 parts
        // of 10 bytes and 1 part of 1 byte -- the last is too small. You also can't
        // send 10 parts of 9 bytes each and 1 part of 11 bytes, because the last is
        // too big. You have to distribute the extra bytes across all the parts so that
        // all of them fall into the proper size range (here, two parts of 10 bytes and
        // nine parts of 9 bytes).
        lens := splitOnMaxSize(sws.dataLen, maxPartSize)

        var srcStart int64
        for _, length := range lens {
            h, err := sws.source.hash()

            if err != nil {
                return nil, nil, nil, err
            }

            copies = append(copies, copyPart{h.String(), srcStart, length})
            srcStart += length
        }
    }
    var offset int64
    for ; i < len(plan.sources.sws); i++ {
        sws := plan.sources.sws[i]
        rdr, err := sws.source.reader(ctx)

        if err != nil {
            return nil, nil, nil, err
        }

        manuals = append(manuals, manualPart{rdr, offset, offset + int64(sws.dataLen)})
        offset += int64(sws.dataLen)
        buffSize += sws.dataLen
    }
    buff = make([]byte, buffSize)
    copy(buff[buffSize-uint64(len(plan.mergedIndex)):], plan.mergedIndex)
    return
}

// Splits |dataLen| into the smallest number of roughly-equal part sizes such that
// each is <= maxPartSize.
func splitOnMaxSize(dataLen, maxPartSize uint64) []int64 {
    numParts := dataLen / maxPartSize
    if dataLen%maxPartSize > 0 {
        numParts++
    }
    baseSize := int64(dataLen / numParts)
    extraBytes := dataLen % numParts
    sizes := make([]int64, numParts)
    for i := range sizes {
        sizes[i] = baseSize
        if extraBytes > 0 {
            sizes[i]++
            extraBytes--
        }
    }
    return sizes
}

func (s3p awsTablePersister) uploadPartCopy(ctx context.Context, src string, srcStart, srcEnd int64, key, uploadID string, partNum int64) (etag string, err error) {
    res, err := s3p.s3.UploadPartCopyWithContext(ctx, &s3.UploadPartCopyInput{
        CopySource:      aws.String(url.PathEscape(s3p.bucket + "/" + s3p.key(src))),
        CopySourceRange: aws.String(s3RangeHeader(srcStart, srcEnd)),
        Bucket:          aws.String(s3p.bucket),
        Key:             aws.String(s3p.key(key)),
        PartNumber:      aws.Int64(int64(partNum)),
        UploadId:        aws.String(uploadID),
    })
    if err == nil {
        etag = *res.CopyPartResult.ETag
    }
    return
}

func (s3p awsTablePersister) uploadPart(ctx context.Context, data []byte, key, uploadID string, partNum int64) (etag string, err error) {
    res, err := s3p.s3.UploadPartWithContext(ctx, &s3.UploadPartInput{
        Bucket:     aws.String(s3p.bucket),
        Key:        aws.String(s3p.key(key)),
        PartNumber: aws.Int64(int64(partNum)),
        UploadId:   aws.String(uploadID),
        Body:       bytes.NewReader(data),
    })
    if err == nil {
        etag = *res.ETag
    }
    return
}

func (s3p awsTablePersister) PruneTableFiles(ctx context.Context, contents manifestContents) error {
    return chunks.ErrUnsupportedOperation
}