github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/types/blob.go (about) 1 // Copyright 2019 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // This file incorporates work covered by the following copyright and 16 // permission notice: 17 // 18 // Copyright 2016 Attic Labs, Inc. All rights reserved. 19 // Licensed under the Apache License, version 2.0: 20 // http://www.apache.org/licenses/LICENSE-2.0 21 22 package types 23 24 import ( 25 "context" 26 "errors" 27 "io" 28 "runtime" 29 "sync" 30 31 "github.com/dolthub/dolt/go/store/atomicerr" 32 33 "github.com/dolthub/dolt/go/store/d" 34 ) 35 36 // Blob represents a list of Blobs. 37 type Blob struct { 38 sequence 39 } 40 41 func newBlob(seq sequence) Blob { 42 return Blob{seq} 43 } 44 45 func NewEmptyBlob(vrw ValueReadWriter) (Blob, error) { 46 seq, err := newBlobLeafSequence(vrw, []byte{}) 47 48 if err != nil { 49 return Blob{}, err 50 } 51 52 return Blob{seq}, nil 53 } 54 55 // ReadAt implements the ReaderAt interface. Eagerly loads requested byte-range from the blob p-tree. 56 func (b Blob) ReadAt(ctx context.Context, p []byte, off int64) (n int, err error) { 57 // TODO: Support negative off? 58 d.PanicIfTrue(off < 0) 59 60 startIdx := uint64(off) 61 if startIdx >= b.Len() { 62 return 0, io.EOF 63 } 64 65 endIdx := startIdx + uint64(len(p)) 66 if endIdx > b.Len() { 67 endIdx = b.Len() 68 } 69 70 var isEOF bool 71 if endIdx == b.Len() { 72 isEOF = true 73 } 74 75 if startIdx == endIdx { 76 return 77 } 78 79 leaves, localStart, err := LoadLeafNodes(ctx, []Collection{b}, startIdx, endIdx) 80 81 if err != nil { 82 return 0, err 83 } 84 85 endIdx = localStart + endIdx - startIdx 86 startIdx = localStart 87 88 for _, leaf := range leaves { 89 bl := leaf.asSequence().(blobLeafSequence) 90 91 localEnd := endIdx 92 data := bl.data() 93 leafLength := uint64(len(data)) 94 if localEnd > leafLength { 95 localEnd = leafLength 96 } 97 src := data[startIdx:localEnd] 98 99 copy(p[n:], src) 100 n += len(src) 101 endIdx -= localEnd 102 startIdx = 0 103 } 104 105 if isEOF { 106 err = io.EOF 107 } 108 109 return n, err 110 } 111 112 func (b Blob) Reader(ctx context.Context) *BlobReader { 113 return &BlobReader{b, 0, ctx} 114 } 115 116 func (b Blob) Copy(ctx context.Context, w io.Writer) (int64, error) { 117 return b.CopyReadAhead(ctx, w, 1<<23 /* 8MB */, 6) 118 } 119 120 // CopyReadAhead copies the entire contents of |b| to |w|, and attempts to stay 121 // |concurrency| |chunkSize| blocks of bytes ahead of the last byte written to 122 // |w|. 123 func (b Blob) CopyReadAhead(ctx context.Context, w io.Writer, chunkSize uint64, concurrency int) (int64, error) { 124 ae := atomicerr.New() 125 bChan := make(chan chan []byte, concurrency) 126 127 go func() { 128 defer close(bChan) 129 for idx, l := uint64(0), b.Len(); idx < l; { 130 if ae.IsSet() { 131 break 132 } 133 134 bc := make(chan []byte) 135 bChan <- bc 136 137 start := idx 138 blockLength := b.Len() - start 139 if blockLength > chunkSize { 140 blockLength = chunkSize 141 } 142 idx += blockLength 143 144 go func() { 145 defer close(bc) 146 buff := make([]byte, blockLength) 147 n, err := b.ReadAt(ctx, buff, int64(start)) 148 149 if err != nil && err != io.EOF { 150 ae.SetIfError(err) 151 } else if n > 0 { 152 bc <- buff 153 } 154 }() 155 } 156 }() 157 158 // Ensure read-ahead goroutines can exit 159 defer func() { 160 for range bChan { 161 } 162 }() 163 164 var n int64 165 for b := range bChan { 166 if ae.IsSet() { 167 break 168 } 169 170 bytes, ok := <-b 171 172 if !ok { 173 continue 174 } 175 176 ln, err := w.Write(bytes) 177 n += int64(ln) 178 if err != nil { 179 ae.SetIfError(err) 180 } 181 } 182 183 return n, ae.Get() 184 } 185 186 // Concat returns a new Blob comprised of this joined with other. It only needs 187 // to visit the rightmost prolly tree chunks of this Blob, and the leftmost 188 // prolly tree chunks of other, so it's efficient. 189 func (b Blob) Concat(ctx context.Context, other Blob) (Blob, error) { 190 seq, err := concat(ctx, b.sequence, other.sequence, func(cur *sequenceCursor, vrw ValueReadWriter) (*sequenceChunker, error) { 191 return b.newChunker(ctx, cur, vrw) 192 }) 193 194 if err != nil { 195 return Blob{}, err 196 } 197 198 return newBlob(seq), nil 199 } 200 201 func (b Blob) newChunker(ctx context.Context, cur *sequenceCursor, vrw ValueReadWriter) (*sequenceChunker, error) { 202 return newSequenceChunker(ctx, cur, 0, vrw, makeBlobLeafChunkFn(vrw), newIndexedMetaSequenceChunkFn(BlobKind, vrw), hashValueByte) 203 } 204 205 func (b Blob) asSequence() sequence { 206 return b.sequence 207 } 208 209 // Value interface 210 func (b Blob) Value(ctx context.Context) (Value, error) { 211 return b, nil 212 } 213 214 func (b Blob) isPrimitive() bool { 215 return true 216 } 217 218 func (b Blob) Kind() NomsKind { 219 if b.sequence == nil { 220 return BlobKind 221 } 222 return b.sequence.Kind() 223 } 224 225 func (b Blob) WalkValues(ctx context.Context, cb ValueCallback) error { 226 return nil 227 } 228 229 type BlobReader struct { 230 b Blob 231 pos int64 232 ctx context.Context 233 } 234 235 func (cbr *BlobReader) Read(p []byte) (n int, err error) { 236 n, err = cbr.b.ReadAt(cbr.ctx, p, cbr.pos) 237 cbr.pos += int64(n) 238 return 239 } 240 241 func (cbr *BlobReader) Seek(offset int64, whence int) (int64, error) { 242 abs := int64(cbr.pos) 243 244 switch whence { 245 case 0: 246 abs = offset 247 case 1: 248 abs += offset 249 case 2: 250 abs = int64(cbr.b.Len()) + offset 251 default: 252 return 0, errors.New("Blob.Reader.Seek: invalid whence") 253 } 254 255 if abs < 0 { 256 return 0, errors.New("Blob.Reader.Seek: negative position") 257 } 258 259 cbr.pos = int64(abs) 260 return abs, nil 261 } 262 263 func makeBlobLeafChunkFn(vrw ValueReadWriter) makeChunkFn { 264 return func(level uint64, items []sequenceItem) (Collection, orderedKey, uint64, error) { 265 d.PanicIfFalse(level == 0) 266 buff := make([]byte, len(items)) 267 268 for i, v := range items { 269 buff[i] = v.(byte) 270 } 271 272 return chunkBlobLeaf(vrw, buff) 273 } 274 } 275 276 func chunkBlobLeaf(vrw ValueReadWriter, buff []byte) (Collection, orderedKey, uint64, error) { 277 seq, err := newBlobLeafSequence(vrw, buff) 278 279 if err != nil { 280 return nil, orderedKey{}, 0, err 281 } 282 283 blob := newBlob(seq) 284 285 ordKey, err := orderedKeyFromInt(len(buff), vrw.Format()) 286 287 if err != nil { 288 return nil, orderedKey{}, 0, err 289 } 290 291 return blob, ordKey, uint64(len(buff)), nil 292 } 293 294 // NewBlob creates a Blob by reading from every Reader in rs and 295 // concatenating the result. NewBlob uses one goroutine per Reader. 296 func NewBlob(ctx context.Context, vrw ValueReadWriter, rs ...io.Reader) (Blob, error) { 297 return readBlobsP(ctx, vrw, rs...) 298 } 299 300 func readBlobsP(ctx context.Context, vrw ValueReadWriter, rs ...io.Reader) (Blob, error) { 301 switch len(rs) { 302 case 0: 303 return NewEmptyBlob(vrw) 304 case 1: 305 return readBlob(ctx, rs[0], vrw) 306 } 307 308 blobs := make([]Blob, len(rs)) 309 310 ae := atomicerr.New() 311 wg := &sync.WaitGroup{} 312 wg.Add(len(rs)) 313 314 for i, r := range rs { 315 if ae.IsSet() { 316 break 317 } 318 319 i2, r2 := i, r 320 go func() { 321 defer wg.Done() 322 323 if !ae.IsSet() { 324 var err error 325 blobs[i2], err = readBlob(ctx, r2, vrw) 326 ae.SetIfError(err) 327 } 328 }() 329 } 330 331 wg.Wait() 332 333 if ae.IsSet() { 334 return Blob{}, ae.Get() 335 } 336 337 b := blobs[0] 338 for i := 1; i < len(blobs); i++ { 339 var err error 340 b, err = b.Concat(ctx, blobs[i]) 341 342 if err != nil { 343 return Blob{}, err 344 } 345 } 346 return b, nil 347 } 348 349 func readBlob(ctx context.Context, r io.Reader, vrw ValueReadWriter) (Blob, error) { 350 sc, err := newEmptySequenceChunker(ctx, vrw, makeBlobLeafChunkFn(vrw), newIndexedMetaSequenceChunkFn(BlobKind, vrw), func(item sequenceItem, rv *rollingValueHasher) error { 351 rv.HashByte(item.(byte)) 352 return nil 353 }) 354 355 if err != nil { 356 return Blob{}, err 357 } 358 359 // TODO: The code below is temporary. It's basically a custom leaf-level chunker for blobs. There are substational 360 // perf gains by doing it this way as it avoids the cost of boxing every single byte which is chunked. 361 chunkBuff := [8192]byte{} 362 chunkBytes := chunkBuff[:] 363 rv := newRollingValueHasher(vrw.Format(), 0) 364 offset := 0 365 addByte := func(b byte) bool { 366 if offset >= len(chunkBytes) { 367 tmp := make([]byte, len(chunkBytes)*2) 368 copy(tmp, chunkBytes) 369 chunkBytes = tmp 370 } 371 chunkBytes[offset] = b 372 offset++ 373 rv.hashByte(b, uint32(offset)) 374 return rv.crossedBoundary 375 } 376 377 ae := atomicerr.New() 378 mtChan := make(chan chan metaTuple, runtime.NumCPU()) 379 380 makeChunk := func() { 381 rv.Reset() 382 cp := make([]byte, offset) 383 copy(cp, chunkBytes[0:offset]) 384 385 ch := make(chan metaTuple) 386 mtChan <- ch 387 388 go func(ch chan metaTuple, cp []byte) { 389 defer close(ch) 390 391 col, key, numLeaves, err := chunkBlobLeaf(vrw, cp) 392 393 if err != nil { 394 ae.SetIfError(err) 395 return 396 } 397 398 val, err := vrw.WriteValue(ctx, col) 399 400 if ae.SetIfError(err) { 401 return 402 } 403 404 mt, err := newMetaTuple(val, key, numLeaves) 405 406 if ae.SetIfError(err) { 407 return 408 } 409 410 ch <- mt 411 }(ch, cp) 412 413 offset = 0 414 } 415 416 go func() { 417 defer close(mtChan) 418 readBuff := [8192]byte{} 419 for { 420 if ae.IsSet() { 421 break 422 } 423 424 n, err := r.Read(readBuff[:]) 425 426 isEOF := err == io.EOF 427 if err != nil && err != io.EOF { 428 ae.SetIfError(err) 429 break 430 } 431 432 for i := 0; i < n; i++ { 433 if addByte(readBuff[i]) { 434 makeChunk() 435 } 436 } 437 438 if isEOF { 439 if offset > 0 { 440 makeChunk() 441 } 442 break 443 } 444 } 445 }() 446 447 for ch := range mtChan { 448 if ae.IsSet() { 449 break 450 } 451 452 mt, ok := <-ch 453 454 if !ok { 455 continue 456 } 457 458 if sc.parent == nil { 459 err := sc.createParent(ctx) 460 461 if ae.SetIfError(err) { 462 continue 463 } 464 } 465 466 _, err := sc.parent.Append(ctx, mt) 467 ae.SetIfError(err) 468 } 469 470 seq, err := sc.Done(ctx) 471 472 if err != nil { 473 return Blob{}, err 474 } 475 476 return newBlob(seq), nil 477 } 478 479 func (b Blob) readFrom(nbf *NomsBinFormat, bnr *binaryNomsReader) (Value, error) { 480 panic("unreachable") 481 } 482 483 func (b Blob) skip(nbf *NomsBinFormat, bnr *binaryNomsReader) { 484 panic("unreachable") 485 } 486 487 func (b Blob) String() string { 488 panic("unreachable") 489 } 490 491 func (b Blob) HumanReadableString() string { 492 panic("unreachable") 493 }