github.com/jbendotnet/noms@v0.0.0-20190904222105-c43e4293ea92/go/nbs/aws_table_persister.go

// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"bytes"
	"io"
	"net/url"
	"sort"
	"sync"
	"time"

	"github.com/attic-labs/noms/go/d"
	"github.com/attic-labs/noms/go/util/verbose"
	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/s3"
)

const (
	minS3PartSize = 5 * 1 << 20  // 5MiB
	maxS3PartSize = 64 * 1 << 20 // 64MiB
	maxS3Parts    = 10000

	maxDynamoChunks   = 64
	maxDynamoItemSize = 400 * (1 << 10) // 400k

	defaultS3PartSize = minS3PartSize // smallest allowed by S3 allows for most throughput
)

type awsTablePersister struct {
	s3         s3svc
	bucket     string
	rl         chan struct{}
	tc         tableCache
	ddb        *ddbTableStore
	limits     awsLimits
	indexCache *indexCache
}

type awsLimits struct {
	partTarget, partMin, partMax uint64
	itemMax                      int
	chunkMax                     uint32
}

func (al awsLimits) tableFitsInDynamo(name addr, dataLen int, chunkCount uint32) bool {
	calcItemSize := func(n addr, dataLen int) int {
		return len(dbAttr) + len(tablePrefix) + len(n.String()) + len(dataAttr) + dataLen
	}
	return chunkCount <= al.chunkMax && calcItemSize(name, dataLen) < al.itemMax
}

func (al awsLimits) tableMayBeInDynamo(chunkCount uint32) bool {
	return chunkCount <= al.chunkMax
}

func (s3p awsTablePersister) Open(name addr, chunkCount uint32, stats *Stats) chunkSource {
	return newAWSChunkSource(
		s3p.ddb,
		&s3ObjectReader{s3: s3p.s3, bucket: s3p.bucket, readRl: s3p.rl, tc: s3p.tc},
		s3p.limits,
		name,
		chunkCount,
		s3p.indexCache,
		stats,
	)
}

type s3UploadedPart struct {
	idx  int64
	etag string
}

func (s3p awsTablePersister) Persist(mt *memTable, haver chunkReader, stats *Stats) chunkSource {
	name, data, chunkCount := mt.write(haver, stats)
	if chunkCount == 0 {
		return emptyChunkSource{}
	}
	if s3p.limits.tableFitsInDynamo(name, len(data), chunkCount) {
		s3p.ddb.Write(name, data)
		return s3p.newReaderFromIndexData(data, name, &dynamoTableReaderAt{ddb: s3p.ddb, h: name})
	}

	if s3p.tc != nil {
		go s3p.tc.store(name, bytes.NewReader(data), uint64(len(data)))
	}
	s3p.multipartUpload(data, name.String())
	tra := &s3TableReaderAt{&s3ObjectReader{s3: s3p.s3, bucket: s3p.bucket, readRl: s3p.rl, tc: s3p.tc}, name}
	return s3p.newReaderFromIndexData(data, name, tra)
}

func (s3p awsTablePersister) newReaderFromIndexData(idxData []byte, name addr, tra tableReaderAt) chunkSource {
	index := parseTableIndex(idxData)
	if s3p.indexCache != nil {
		s3p.indexCache.lockEntry(name)
		defer s3p.indexCache.unlockEntry(name)
		s3p.indexCache.put(name, index)
	}
	return &awsChunkSource{newTableReader(index, tra, s3BlockSize), name}
}

func (s3p awsTablePersister) multipartUpload(data []byte, key string) {
	uploadID := s3p.startMultipartUpload(key)
	multipartUpload, err := s3p.uploadParts(data, key, uploadID)
	if err != nil {
		s3p.abortMultipartUpload(key, uploadID)
		d.PanicIfError(err) // TODO: Better error handling here
	}
	s3p.completeMultipartUpload(key, uploadID, multipartUpload)
}

func (s3p awsTablePersister) startMultipartUpload(key string) string {
	result, err := s3p.s3.CreateMultipartUpload(&s3.CreateMultipartUploadInput{
		Bucket: aws.String(s3p.bucket),
		Key:    aws.String(key),
	})
	d.PanicIfError(err)
	return *result.UploadId
}

func (s3p awsTablePersister) abortMultipartUpload(key, uploadID string) {
	_, abrtErr := s3p.s3.AbortMultipartUpload(&s3.AbortMultipartUploadInput{
		Bucket:   aws.String(s3p.bucket),
		Key:      aws.String(key),
		UploadId: aws.String(uploadID),
	})
	d.PanicIfError(abrtErr)
}

func (s3p awsTablePersister) completeMultipartUpload(key, uploadID string, mpu *s3.CompletedMultipartUpload) {
	_, err := s3p.s3.CompleteMultipartUpload(&s3.CompleteMultipartUploadInput{
		Bucket:          aws.String(s3p.bucket),
		Key:             aws.String(key),
		MultipartUpload: mpu,
		UploadId:        aws.String(uploadID),
	})
	d.PanicIfError(err)
}

func (s3p awsTablePersister) uploadParts(data []byte, key, uploadID string) (*s3.CompletedMultipartUpload, error) {
	sent, failed, done := make(chan s3UploadedPart), make(chan error), make(chan struct{})

	numParts := getNumParts(uint64(len(data)), s3p.limits.partTarget)
	d.PanicIfTrue(numParts > maxS3Parts) // TODO: BUG 3433: handle > 10k parts
	var wg sync.WaitGroup
	sendPart := func(partNum, start, end uint64) {
		if s3p.rl != nil {
			s3p.rl <- struct{}{}
			defer func() { <-s3p.rl }()
		}
		defer wg.Done()

		// Check if upload has been terminated
		select {
		case <-done:
			return
		default:
		}
		// Upload the desired part
		if partNum == numParts { // If this is the last part, make sure it includes any overflow
			end = uint64(len(data))
		}
		etag, err := s3p.uploadPart(data[start:end], key, uploadID, int64(partNum))
		if err != nil {
			failed <- err
			return
		}
		// Try to send along part info. In the case that the upload was aborted, reading from
		// done allows this worker to exit correctly.
		select {
		case sent <- s3UploadedPart{int64(partNum), etag}:
		case <-done:
			return
		}
	}
	for i := uint64(0); i < numParts; i++ {
		wg.Add(1)
		partNum := i + 1 // Parts are 1-indexed
		start, end := i*s3p.limits.partTarget, (i+1)*s3p.limits.partTarget
		go sendPart(partNum, start, end)
	}
	go func() {
		wg.Wait()
		close(sent)
		close(failed)
	}()

	multipartUpload := &s3.CompletedMultipartUpload{}
	var firstFailure error
	for cont := true; cont; {
		select {
		case sentPart, open := <-sent:
			if open {
				multipartUpload.Parts = append(multipartUpload.Parts, &s3.CompletedPart{
					ETag:       aws.String(sentPart.etag),
					PartNumber: aws.Int64(sentPart.idx),
				})
			}
			cont = open

		case err := <-failed:
			if err != nil && firstFailure == nil { // nil err may happen when failed gets closed
				firstFailure = err
				close(done)
			}
		}
	}

	if firstFailure == nil {
		close(done)
	}
	sort.Sort(partsByPartNum(multipartUpload.Parts))
	return multipartUpload, firstFailure
}

func getNumParts(dataLen, minPartSize uint64) uint64 {
	numParts := dataLen / minPartSize
	if numParts == 0 {
		numParts = 1
	}
	return numParts
}

type partsByPartNum []*s3.CompletedPart

func (s partsByPartNum) Len() int {
	return len(s)
}

func (s partsByPartNum) Less(i, j int) bool {
	return *s[i].PartNumber < *s[j].PartNumber
}

func (s partsByPartNum) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}

func (s3p awsTablePersister) ConjoinAll(sources chunkSources, stats *Stats) chunkSource {
	plan := planConjoin(sources, stats)
	if plan.chunkCount == 0 {
		return emptyChunkSource{}
	}
	t1 := time.Now()
	name := nameFromSuffixes(plan.suffixes())
	s3p.executeCompactionPlan(plan, name.String())
	verbose.Log("Compacted table of %d Kb in %s", plan.totalCompressedData/1024, time.Since(t1))

	if s3p.tc != nil {
		go s3p.loadIntoCache(name) // load conjoined table to the cache
	}
	tra := &s3TableReaderAt{&s3ObjectReader{s3: s3p.s3, bucket: s3p.bucket, readRl: s3p.rl, tc: s3p.tc}, name}
	return s3p.newReaderFromIndexData(plan.mergedIndex, name, tra)
}

func (s3p awsTablePersister) loadIntoCache(name addr) {
	input := &s3.GetObjectInput{
		Bucket: aws.String(s3p.bucket),
		Key:    aws.String(name.String()),
	}
	result, err := s3p.s3.GetObject(input)
	d.PanicIfError(err)

	s3p.tc.store(name, result.Body, uint64(*result.ContentLength))
}

func (s3p awsTablePersister) executeCompactionPlan(plan compactionPlan, key string) {
	uploadID := s3p.startMultipartUpload(key)
	multipartUpload, err := s3p.assembleTable(plan, key, uploadID)
	if err != nil {
		s3p.abortMultipartUpload(key, uploadID)
		d.PanicIfError(err) // TODO: Better error handling here
	}
	s3p.completeMultipartUpload(key, uploadID, multipartUpload)
}

func (s3p awsTablePersister) assembleTable(plan compactionPlan, key, uploadID string) (*s3.CompletedMultipartUpload, error) {
	d.PanicIfTrue(len(plan.sources) > maxS3Parts) // TODO: BUG 3433: handle > 10k parts

	// Separate plan.sources by amount of chunkData. Tables with >5MB of chunk data (copies) can
	// be added to the new table using S3's multipart upload copy feature. Smaller tables with
	// <5MB of chunk data (manuals) must be read, assembled into |buff|, and then re-uploaded in
	// parts that are larger than 5MB.
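	// For instance (an illustrative example, assuming partMin and partMax mirror the 5MiB and
	// 64MiB S3 limits defined at the top of this file): a 200MiB source would be copied in four
	// 50MiB parts, a 20MiB source would be copied as a single part, and a 3MiB source would be
	// read into |buff| and re-uploaded as part of a larger manual part.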
	copies, manuals, buff := dividePlan(plan, uint64(s3p.limits.partMin), uint64(s3p.limits.partMax))

	// Concurrently read data from small tables into |buff|
	var readWg sync.WaitGroup
	for _, man := range manuals {
		readWg.Add(1)
		go func(m manualPart) {
			defer readWg.Done()
			n, _ := m.srcR.Read(buff[m.dstStart:m.dstEnd])
			d.PanicIfTrue(int64(n) < m.dstEnd-m.dstStart)
		}(man)
	}
	readWg.Wait()

	// sendPart calls |doUpload| to send part |partNum|, forwarding errors over |failed| or
	// success over |sent|. Closing (or sending) on |done| will cancel all in-progress calls to
	// sendPart.
	sent, failed, done := make(chan s3UploadedPart), make(chan error), make(chan struct{})
	var uploadWg sync.WaitGroup
	type uploadFn func() (etag string, err error)
	sendPart := func(partNum int64, doUpload uploadFn) {
		if s3p.rl != nil {
			s3p.rl <- struct{}{}
			defer func() { <-s3p.rl }()
		}
		defer uploadWg.Done()

		// Check if upload has been terminated
		select {
		case <-done:
			return
		default:
		}

		etag, err := doUpload()
		if err != nil {
			failed <- err
			return
		}
		// Try to send along part info. In the case that the upload was aborted, reading from
		// done allows this worker to exit correctly.
		select {
		case sent <- s3UploadedPart{int64(partNum), etag}:
		case <-done:
			return
		}
	}

	// Concurrently begin sending all parts using sendPart().
	// First, kick off sending all the copyable parts.
	partNum := int64(1) // Part numbers are 1-indexed
	for _, cp := range copies {
		uploadWg.Add(1)
		go func(cp copyPart, partNum int64) {
			sendPart(partNum, func() (etag string, err error) {
				return s3p.uploadPartCopy(cp.name, cp.srcOffset, cp.srcLen, key, uploadID, partNum)
			})
		}(cp, partNum)
		partNum++
	}

	// Then, split buff (data from |manuals| and index) into parts and upload those concurrently.
	numManualParts := getNumParts(uint64(len(buff)), s3p.limits.partTarget) // TODO: What if this is too big?
	for i := uint64(0); i < numManualParts; i++ {
		start, end := i*s3p.limits.partTarget, (i+1)*s3p.limits.partTarget
		if i+1 == numManualParts { // If this is the last part, make sure it includes any overflow
			end = uint64(len(buff))
		}
		uploadWg.Add(1)
		go func(data []byte, partNum int64) {
			sendPart(partNum, func() (etag string, err error) {
				return s3p.uploadPart(data, key, uploadID, partNum)
			})
		}(buff[start:end], partNum)
		partNum++
	}

	// When all the uploads started above are done, close |sent| and |failed| so that the code
	// below will correctly detect that we're done sending parts and move forward.
	go func() {
		uploadWg.Wait()
		close(sent)
		close(failed)
	}()

	// Watch |sent| and |failed| for the results of part uploads. If ever one fails, close |done|
	// to stop all the in-progress or pending sendPart() calls and then bail.
	multipartUpload := &s3.CompletedMultipartUpload{}
	var firstFailure error
	for cont := true; cont; {
		select {
		case sentPart, open := <-sent:
			if open {
				multipartUpload.Parts = append(multipartUpload.Parts, &s3.CompletedPart{
					ETag:       aws.String(sentPart.etag),
					PartNumber: aws.Int64(sentPart.idx),
				})
			}
			cont = open

		case err := <-failed:
			if err != nil && firstFailure == nil { // nil err may happen when failed gets closed
				firstFailure = err
				close(done)
			}
		}
	}

	// If there was any failure detected above, |done| is already closed
	if firstFailure == nil {
		close(done)
	}
	sort.Sort(partsByPartNum(multipartUpload.Parts)) // S3 requires that these be in part-order
	return multipartUpload, firstFailure
}

type copyPart struct {
	name              string
	srcOffset, srcLen int64
}

type manualPart struct {
	srcR             io.Reader
	dstStart, dstEnd int64
}

// dividePlan assumes that plan.sources (which is of type chunkSourcesByDescendingDataSize) is
// correctly sorted by descending data size.
func dividePlan(plan compactionPlan, minPartSize, maxPartSize uint64) (copies []copyPart, manuals []manualPart, buff []byte) {
	// NB: if maxPartSize < 2*minPartSize, splitting large copies apart isn't solvable. S3's
	// limits are plenty far enough apart that this isn't a problem in production, but we could
	// violate this in tests.
	d.PanicIfTrue(maxPartSize < 2*minPartSize)

	buffSize := uint64(len(plan.mergedIndex))
	i := 0
	for ; i < len(plan.sources); i++ {
		sws := plan.sources[i]
		if sws.dataLen < minPartSize {
			// since plan.sources is sorted in descending chunk-data-length order, we know that
			// sws and all members after it are too small to copy.
			break
		}
		if sws.dataLen <= maxPartSize {
			copies = append(copies, copyPart{sws.source.hash().String(), 0, int64(sws.dataLen)})
			continue
		}

		// Now, we need to break the data into some number of parts such that for all parts
		// minPartSize <= size(part) <= maxPartSize. This code tries to split the part evenly,
		// such that all new parts satisfy the previous inequality. This gets tricky around edge
		// cases. Consider min = 5b and max = 10b and a data length of 101b. You need to send 11
		// parts, but you can't just send 10 parts of 10 bytes and 1 part of 1 byte -- the last
		// is too small. You also can't send 10 parts of 9 bytes each and 1 part of 11 bytes,
		// because the last is too big. You have to distribute the extra bytes across all the
		// parts so that all of them fall into the proper size range.
		lens := splitOnMaxSize(sws.dataLen, maxPartSize)

		var srcStart int64
		for _, length := range lens {
			copies = append(copies, copyPart{sws.source.hash().String(), srcStart, length})
			srcStart += length
		}
	}
	var offset int64
	for ; i < len(plan.sources); i++ {
		sws := plan.sources[i]
		manuals = append(manuals, manualPart{sws.source.reader(), offset, offset + int64(sws.dataLen)})
		offset += int64(sws.dataLen)
		buffSize += sws.dataLen
	}
	buff = make([]byte, buffSize)
	copy(buff[buffSize-uint64(len(plan.mergedIndex)):], plan.mergedIndex)
	return
}

// splitOnMaxSize splits |dataLen| into the minimum number of roughly-equal part sizes such that
// each is <= maxPartSize.
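// For example, with the toy numbers from dividePlan's comment above (min = 5b, max = 10b,
// dataLen = 101b), this yields 11 parts: two of 10 bytes followed by nine of 9 bytes, so every
// part also stays above the 5-byte minimum.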
func splitOnMaxSize(dataLen, maxPartSize uint64) []int64 {
	numParts := dataLen / maxPartSize
	if dataLen%maxPartSize > 0 {
		numParts++
	}
	baseSize := int64(dataLen / numParts)
	extraBytes := dataLen % numParts
	sizes := make([]int64, numParts)
	for i := range sizes {
		sizes[i] = baseSize
		if extraBytes > 0 {
			sizes[i]++
			extraBytes--
		}
	}
	return sizes
}

func (s3p awsTablePersister) uploadPartCopy(src string, srcStart, srcEnd int64, key, uploadID string, partNum int64) (etag string, err error) {
	res, err := s3p.s3.UploadPartCopy(&s3.UploadPartCopyInput{
		// TODO: Use url.PathEscape() once we're on go 1.8
		CopySource:      aws.String(url.QueryEscape(s3p.bucket + "/" + src)),
		CopySourceRange: aws.String(s3RangeHeader(srcStart, srcEnd)),
		Bucket:          aws.String(s3p.bucket),
		Key:             aws.String(key),
		PartNumber:      aws.Int64(int64(partNum)),
		UploadId:        aws.String(uploadID),
	})
	if err == nil {
		etag = *res.CopyPartResult.ETag
	}
	return
}

func (s3p awsTablePersister) uploadPart(data []byte, key, uploadID string, partNum int64) (etag string, err error) {
	res, err := s3p.s3.UploadPart(&s3.UploadPartInput{
		Bucket:     aws.String(s3p.bucket),
		Key:        aws.String(key),
		PartNumber: aws.Int64(int64(partNum)),
		UploadId:   aws.String(uploadID),
		Body:       bytes.NewReader(data),
	})
	if err == nil {
		etag = *res.ETag
	}
	return
}
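// Note: the awsLimits used with this persister are constructed elsewhere in the package; a
// plausible wiring (a hypothetical sketch, not taken from this file) would mirror the constants
// at the top of the file:
//
//	limits := awsLimits{
//		partTarget: defaultS3PartSize, // 5MiB
//		partMin:    minS3PartSize,     // 5MiB
//		partMax:    maxS3PartSize,     // 64MiB
//		itemMax:    maxDynamoItemSize, // 400k
//		chunkMax:   maxDynamoChunks,   // 64
//	}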