github.com/divan/go-ethereum@v1.8.14-0.20180820134928-1de9ada4016d/swarm/storage/chunker.go (about) 1 // Copyright 2016 The go-ethereum Authors 2 // This file is part of the go-ethereum library. 3 // 4 // The go-ethereum library is free software: you can redistribute it and/or modify 5 // it under the terms of the GNU Lesser General Public License as published by 6 // the Free Software Foundation, either version 3 of the License, or 7 // (at your option) any later version. 8 // 9 // The go-ethereum library is distributed in the hope that it will be useful, 10 // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 // GNU Lesser General Public License for more details. 13 // 14 // You should have received a copy of the GNU Lesser General Public License 15 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>. 16 package storage 17 18 import ( 19 "context" 20 "encoding/binary" 21 "errors" 22 "fmt" 23 "io" 24 "sync" 25 "time" 26 27 "github.com/ethereum/go-ethereum/metrics" 28 "github.com/ethereum/go-ethereum/swarm/chunk" 29 "github.com/ethereum/go-ethereum/swarm/log" 30 "github.com/ethereum/go-ethereum/swarm/spancontext" 31 opentracing "github.com/opentracing/opentracing-go" 32 olog "github.com/opentracing/opentracing-go/log" 33 ) 34 35 /* 36 The distributed storage implemented in this package requires fix sized chunks of content. 37 38 Chunker is the interface to a component that is responsible for disassembling and assembling larger data. 39 40 TreeChunker implements a Chunker based on a tree structure defined as follows: 41 42 1 each node in the tree including the root and other branching nodes are stored as a chunk. 43 44 2 branching nodes encode data contents that includes the size of the dataslice covered by its entire subtree under the node as well as the hash keys of all its children : 45 data_{i} := size(subtree_{i}) || key_{j} || key_{j+1} .... 
     || key_{j+n-1}

3 Leaf nodes encode an actual subslice of the input data.

4 If the data size is not more than the maximum chunk size, the data is stored in a single chunk:
  key = hash(int64(size) + data)

5 If the data size is more than chunksize*branches^l, but no more than
  chunksize*branches^(l+1), the data vector is split into slices of
  chunksize*branches^l length (except the last one, which may be shorter):
  key = hash(int64(size) + key(slice0) + key(slice1) + ...)

The underlying hash function is configurable.
*/

/*
Tree chunker is a concrete implementation of data chunking.
This chunker works in a simple way: it builds a tree out of the document so that each node represents either a chunk of real data or a chunk of data describing a branching (non-leaf) node of the tree. In particular, each such non-leaf chunk is a concatenation of the hashes of its respective children. This scheme simultaneously guarantees data integrity as well as self addressing. Abstract nodes are transparent since their represented size component is strictly greater than their maximum data size, since they encode a subtree.

If all is well, it is possible to implement this by simply composing readers so that no extra allocation or buffering is necessary for the data splitting and joining. This means that in principle there can be direct IO between memory, file system and network socket (a bzz peer's storage request is read from the socket). In practice there may be a need for several stages of internal buffering.
The hashing itself does use extra copies and allocation though, since it needs them.
*/

// Errors returned by the chunker.
var (
	// errAppendOppNotSuported is returned by TreeChunker.Append.
	errAppendOppNotSuported = errors.New("Append operation not supported")
	// errOperationTimedOut is returned by TreeChunker.Split when no result
	// or error arrives within splitTimeout.
	errOperationTimedOut = errors.New("operation timed out")
)

// ChunkerParams holds the sizing parameters shared by splitting and joining.
// The constructors derive the branching factor as chunkSize / hashSize.
type ChunkerParams struct {
	chunkSize int64 // TreeJoin and TreeSplit pass chunk.DefaultSize
	hashSize  int64 // length of a chunk reference: len(addr) in TreeJoin, putter.RefSize() in TreeSplit
}

// SplitterParams extends ChunkerParams with the input and output of a split.
type SplitterParams struct {
	ChunkerParams
	reader io.Reader // source of the data to split; becomes TreeChunker.data
	putter Putter    // receives every chunk produced by the split
	addr   Address   // copied to TreeChunker.addr; TreeSplit leaves it unset
}

// TreeSplitterParams is the parameter set consumed by NewTreeSplitter.
type TreeSplitterParams struct {
	SplitterParams
	size int64 // total number of bytes to read from reader; becomes TreeChunker.dataSize
}

// JoinerParams is the parameter set consumed by NewTreeJoiner.
type JoinerParams struct {
	ChunkerParams
	addr   Address // root reference of the content to join
	getter Getter  // used to fetch chunks by reference
	// TODO: there is a bug, so depth can only be 0 today, see: https://github.com/ethersphere/go-ethereum/issues/344
	depth int
	ctx   context.Context // handed on to the LazyChunkReader created by Join
}

// TreeChunker carries the state of one split or one join. It is created by
// NewTreeSplitter (for Split) or NewTreeJoiner (for Join); each constructor
// fills only the fields its operation needs.
type TreeChunker struct {
	ctx context.Context // set by NewTreeJoiner from JoinerParams.ctx; NewTreeSplitter does not set it

	branches int64       // chunkSize / hashSize, computed by the constructors
	hashFunc SwarmHasher // not assigned by NewTreeJoiner or NewTreeSplitter
	dataSize int64       // splitter only: total size of the input
	data     io.Reader   // splitter only: the input
	// calculated
	addr        Address
	depth       int          // joiner only: copied from JoinerParams.depth
	hashSize    int64        // copied from ChunkerParams.hashSize
	chunkSize   int64        // copied from ChunkerParams.chunkSize
	workerCount int64        // the number of worker routines used
	workerLock  sync.RWMutex // lock for the worker count
	jobC        chan *hashJob   // chunks waiting to be stored; buffered with 2*ChunkProcessors slots
	wg          *sync.WaitGroup // Split adds one entry, released when the root chunk has been stored
	putter      Putter          // splitter only
	getter      Getter          // joiner only
	errC        chan error      // unbuffered; carries errors to Split, which closes it once wg completes
	quitC       chan bool       // closed when Split returns
}

/*
   Join reconstructs original content based on a root key.
   When joining, the caller gets returned a Lazy SectionReader, which is
   seekable and implements on-demand fetching of chunks as and where it is read.
   New chunks to retrieve are coming from the getter, which the caller provides.
   If an error is encountered during joining, it appears as a read error on
   the returned reader.
   As a result, partial reads from a document are possible even if other parts
   are corrupt or lost.
   The chunks are not meant to be validated by the chunker when joining. This
   is because it is left to the DPA to decide which sources are trusted.
132 */ 133 func TreeJoin(ctx context.Context, addr Address, getter Getter, depth int) *LazyChunkReader { 134 jp := &JoinerParams{ 135 ChunkerParams: ChunkerParams{ 136 chunkSize: chunk.DefaultSize, 137 hashSize: int64(len(addr)), 138 }, 139 addr: addr, 140 getter: getter, 141 depth: depth, 142 ctx: ctx, 143 } 144 145 return NewTreeJoiner(jp).Join(ctx) 146 } 147 148 /* 149 When splitting, data is given as a SectionReader, and the key is a hashSize long byte slice (Key), the root hash of the entire content will fill this once processing finishes. 150 New chunks to store are store using the putter which the caller provides. 151 */ 152 func TreeSplit(ctx context.Context, data io.Reader, size int64, putter Putter) (k Address, wait func(context.Context) error, err error) { 153 tsp := &TreeSplitterParams{ 154 SplitterParams: SplitterParams{ 155 ChunkerParams: ChunkerParams{ 156 chunkSize: chunk.DefaultSize, 157 hashSize: putter.RefSize(), 158 }, 159 reader: data, 160 putter: putter, 161 }, 162 size: size, 163 } 164 return NewTreeSplitter(tsp).Split(ctx) 165 } 166 167 func NewTreeJoiner(params *JoinerParams) *TreeChunker { 168 tc := &TreeChunker{} 169 tc.hashSize = params.hashSize 170 tc.branches = params.chunkSize / params.hashSize 171 tc.addr = params.addr 172 tc.getter = params.getter 173 tc.depth = params.depth 174 tc.chunkSize = params.chunkSize 175 tc.workerCount = 0 176 tc.jobC = make(chan *hashJob, 2*ChunkProcessors) 177 tc.wg = &sync.WaitGroup{} 178 tc.errC = make(chan error) 179 tc.quitC = make(chan bool) 180 181 tc.ctx = params.ctx 182 183 return tc 184 } 185 186 func NewTreeSplitter(params *TreeSplitterParams) *TreeChunker { 187 tc := &TreeChunker{} 188 tc.data = params.reader 189 tc.dataSize = params.size 190 tc.hashSize = params.hashSize 191 tc.branches = params.chunkSize / params.hashSize 192 tc.addr = params.addr 193 tc.chunkSize = params.chunkSize 194 tc.putter = params.putter 195 tc.workerCount = 0 196 tc.jobC = make(chan *hashJob, 2*ChunkProcessors) 197 
tc.wg = &sync.WaitGroup{} 198 tc.errC = make(chan error) 199 tc.quitC = make(chan bool) 200 201 return tc 202 } 203 204 // String() for pretty printing 205 func (c *Chunk) String() string { 206 return fmt.Sprintf("Key: %v TreeSize: %v Chunksize: %v", c.Addr.Log(), c.Size, len(c.SData)) 207 } 208 209 type hashJob struct { 210 key Address 211 chunk []byte 212 size int64 213 parentWg *sync.WaitGroup 214 } 215 216 func (tc *TreeChunker) incrementWorkerCount() { 217 tc.workerLock.Lock() 218 defer tc.workerLock.Unlock() 219 tc.workerCount += 1 220 } 221 222 func (tc *TreeChunker) getWorkerCount() int64 { 223 tc.workerLock.RLock() 224 defer tc.workerLock.RUnlock() 225 return tc.workerCount 226 } 227 228 func (tc *TreeChunker) decrementWorkerCount() { 229 tc.workerLock.Lock() 230 defer tc.workerLock.Unlock() 231 tc.workerCount -= 1 232 } 233 234 func (tc *TreeChunker) Split(ctx context.Context) (k Address, wait func(context.Context) error, err error) { 235 if tc.chunkSize <= 0 { 236 panic("chunker must be initialised") 237 } 238 239 tc.runWorker() 240 241 depth := 0 242 treeSize := tc.chunkSize 243 244 // takes lowest depth such that chunksize*HashCount^(depth+1) > size 245 // power series, will find the order of magnitude of the data size in base hashCount or numbers of levels of branching in the resulting tree. 
246 for ; treeSize < tc.dataSize; treeSize *= tc.branches { 247 depth++ 248 } 249 250 key := make([]byte, tc.hashSize) 251 // this waitgroup member is released after the root hash is calculated 252 tc.wg.Add(1) 253 //launch actual recursive function passing the waitgroups 254 go tc.split(depth, treeSize/tc.branches, key, tc.dataSize, tc.wg) 255 256 // closes internal error channel if all subprocesses in the workgroup finished 257 go func() { 258 // waiting for all threads to finish 259 tc.wg.Wait() 260 close(tc.errC) 261 }() 262 263 defer close(tc.quitC) 264 defer tc.putter.Close() 265 select { 266 case err := <-tc.errC: 267 if err != nil { 268 return nil, nil, err 269 } 270 case <-time.NewTimer(splitTimeout).C: 271 return nil, nil, errOperationTimedOut 272 } 273 274 return key, tc.putter.Wait, nil 275 } 276 277 func (tc *TreeChunker) split(depth int, treeSize int64, addr Address, size int64, parentWg *sync.WaitGroup) { 278 279 // 280 281 for depth > 0 && size < treeSize { 282 treeSize /= tc.branches 283 depth-- 284 } 285 286 if depth == 0 { 287 // leaf nodes -> content chunks 288 chunkData := make([]byte, size+8) 289 binary.LittleEndian.PutUint64(chunkData[0:8], uint64(size)) 290 var readBytes int64 291 for readBytes < size { 292 n, err := tc.data.Read(chunkData[8+readBytes:]) 293 readBytes += int64(n) 294 if err != nil && !(err == io.EOF && readBytes == size) { 295 tc.errC <- err 296 return 297 } 298 } 299 select { 300 case tc.jobC <- &hashJob{addr, chunkData, size, parentWg}: 301 case <-tc.quitC: 302 } 303 return 304 } 305 // dept > 0 306 // intermediate chunk containing child nodes hashes 307 branchCnt := (size + treeSize - 1) / treeSize 308 309 var chunk = make([]byte, branchCnt*tc.hashSize+8) 310 var pos, i int64 311 312 binary.LittleEndian.PutUint64(chunk[0:8], uint64(size)) 313 314 childrenWg := &sync.WaitGroup{} 315 var secSize int64 316 for i < branchCnt { 317 // the last item can have shorter data 318 if size-pos < treeSize { 319 secSize = size - pos 320 
} else { 321 secSize = treeSize 322 } 323 // the hash of that data 324 subTreeKey := chunk[8+i*tc.hashSize : 8+(i+1)*tc.hashSize] 325 326 childrenWg.Add(1) 327 tc.split(depth-1, treeSize/tc.branches, subTreeKey, secSize, childrenWg) 328 329 i++ 330 pos += treeSize 331 } 332 // wait for all the children to complete calculating their hashes and copying them onto sections of the chunk 333 // parentWg.Add(1) 334 // go func() { 335 childrenWg.Wait() 336 337 worker := tc.getWorkerCount() 338 if int64(len(tc.jobC)) > worker && worker < ChunkProcessors { 339 tc.runWorker() 340 341 } 342 select { 343 case tc.jobC <- &hashJob{addr, chunk, size, parentWg}: 344 case <-tc.quitC: 345 } 346 } 347 348 func (tc *TreeChunker) runWorker() { 349 tc.incrementWorkerCount() 350 go func() { 351 defer tc.decrementWorkerCount() 352 for { 353 select { 354 355 case job, ok := <-tc.jobC: 356 if !ok { 357 return 358 } 359 360 h, err := tc.putter.Put(tc.ctx, job.chunk) 361 if err != nil { 362 tc.errC <- err 363 return 364 } 365 copy(job.key, h) 366 job.parentWg.Done() 367 case <-tc.quitC: 368 return 369 } 370 } 371 }() 372 } 373 374 func (tc *TreeChunker) Append() (Address, func(), error) { 375 return nil, nil, errAppendOppNotSuported 376 } 377 378 // LazyChunkReader implements LazySectionReader 379 type LazyChunkReader struct { 380 Ctx context.Context 381 key Address // root key 382 chunkData ChunkData 383 off int64 // offset 384 chunkSize int64 // inherit from chunker 385 branches int64 // inherit from chunker 386 hashSize int64 // inherit from chunker 387 depth int 388 getter Getter 389 } 390 391 func (tc *TreeChunker) Join(ctx context.Context) *LazyChunkReader { 392 return &LazyChunkReader{ 393 key: tc.addr, 394 chunkSize: tc.chunkSize, 395 branches: tc.branches, 396 hashSize: tc.hashSize, 397 depth: tc.depth, 398 getter: tc.getter, 399 Ctx: tc.ctx, 400 } 401 } 402 403 func (r *LazyChunkReader) Context() context.Context { 404 return r.Ctx 405 } 406 407 // Size is meant to be called on the 
LazySectionReader 408 func (r *LazyChunkReader) Size(ctx context.Context, quitC chan bool) (n int64, err error) { 409 metrics.GetOrRegisterCounter("lazychunkreader.size", nil).Inc(1) 410 411 var sp opentracing.Span 412 var cctx context.Context 413 cctx, sp = spancontext.StartSpan( 414 ctx, 415 "lcr.size") 416 defer sp.Finish() 417 418 log.Debug("lazychunkreader.size", "key", r.key) 419 if r.chunkData == nil { 420 chunkData, err := r.getter.Get(cctx, Reference(r.key)) 421 if err != nil { 422 return 0, err 423 } 424 if chunkData == nil { 425 select { 426 case <-quitC: 427 return 0, errors.New("aborted") 428 default: 429 return 0, fmt.Errorf("root chunk not found for %v", r.key.Hex()) 430 } 431 } 432 r.chunkData = chunkData 433 } 434 return r.chunkData.Size(), nil 435 } 436 437 // read at can be called numerous times 438 // concurrent reads are allowed 439 // Size() needs to be called synchronously on the LazyChunkReader first 440 func (r *LazyChunkReader) ReadAt(b []byte, off int64) (read int, err error) { 441 metrics.GetOrRegisterCounter("lazychunkreader.readat", nil).Inc(1) 442 443 var sp opentracing.Span 444 var cctx context.Context 445 cctx, sp = spancontext.StartSpan( 446 r.Ctx, 447 "lcr.read") 448 defer sp.Finish() 449 450 defer func() { 451 sp.LogFields( 452 olog.Int("off", int(off)), 453 olog.Int("read", read)) 454 }() 455 456 // this is correct, a swarm doc cannot be zero length, so no EOF is expected 457 if len(b) == 0 { 458 return 0, nil 459 } 460 quitC := make(chan bool) 461 size, err := r.Size(cctx, quitC) 462 if err != nil { 463 log.Error("lazychunkreader.readat.size", "size", size, "err", err) 464 return 0, err 465 } 466 467 errC := make(chan error) 468 469 // } 470 var treeSize int64 471 var depth int 472 // calculate depth and max treeSize 473 treeSize = r.chunkSize 474 for ; treeSize < size; treeSize *= r.branches { 475 depth++ 476 } 477 wg := sync.WaitGroup{} 478 length := int64(len(b)) 479 for d := 0; d < r.depth; d++ { 480 off *= r.chunkSize 481 
length *= r.chunkSize 482 } 483 wg.Add(1) 484 go r.join(cctx, b, off, off+length, depth, treeSize/r.branches, r.chunkData, &wg, errC, quitC) 485 go func() { 486 wg.Wait() 487 close(errC) 488 }() 489 490 err = <-errC 491 if err != nil { 492 log.Error("lazychunkreader.readat.errc", "err", err) 493 close(quitC) 494 return 0, err 495 } 496 if off+int64(len(b)) >= size { 497 return int(size - off), io.EOF 498 } 499 return len(b), nil 500 } 501 502 func (r *LazyChunkReader) join(ctx context.Context, b []byte, off int64, eoff int64, depth int, treeSize int64, chunkData ChunkData, parentWg *sync.WaitGroup, errC chan error, quitC chan bool) { 503 defer parentWg.Done() 504 // find appropriate block level 505 for chunkData.Size() < treeSize && depth > r.depth { 506 treeSize /= r.branches 507 depth-- 508 } 509 510 // leaf chunk found 511 if depth == r.depth { 512 extra := 8 + eoff - int64(len(chunkData)) 513 if extra > 0 { 514 eoff -= extra 515 } 516 copy(b, chunkData[8+off:8+eoff]) 517 return // simply give back the chunks reader for content chunks 518 } 519 520 // subtree 521 start := off / treeSize 522 end := (eoff + treeSize - 1) / treeSize 523 524 // last non-leaf chunk can be shorter than default chunk size, let's not read it further then its end 525 currentBranches := int64(len(chunkData)-8) / r.hashSize 526 if end > currentBranches { 527 end = currentBranches 528 } 529 530 wg := &sync.WaitGroup{} 531 defer wg.Wait() 532 for i := start; i < end; i++ { 533 soff := i * treeSize 534 roff := soff 535 seoff := soff + treeSize 536 537 if soff < off { 538 soff = off 539 } 540 if seoff > eoff { 541 seoff = eoff 542 } 543 if depth > 1 { 544 wg.Wait() 545 } 546 wg.Add(1) 547 go func(j int64) { 548 childKey := chunkData[8+j*r.hashSize : 8+(j+1)*r.hashSize] 549 chunkData, err := r.getter.Get(ctx, Reference(childKey)) 550 if err != nil { 551 log.Error("lazychunkreader.join", "key", fmt.Sprintf("%x", childKey), "err", err) 552 select { 553 case errC <- fmt.Errorf("chunk %v-%v not 
found; key: %s", off, off+treeSize, fmt.Sprintf("%x", childKey)): 554 case <-quitC: 555 } 556 return 557 } 558 if l := len(chunkData); l < 9 { 559 select { 560 case errC <- fmt.Errorf("chunk %v-%v incomplete; key: %s, data length %v", off, off+treeSize, fmt.Sprintf("%x", childKey), l): 561 case <-quitC: 562 } 563 return 564 } 565 if soff < off { 566 soff = off 567 } 568 r.join(ctx, b[soff-off:seoff-off], soff-roff, seoff-roff, depth-1, treeSize/r.branches, chunkData, wg, errC, quitC) 569 }(i) 570 } //for 571 } 572 573 // Read keeps a cursor so cannot be called simulateously, see ReadAt 574 func (r *LazyChunkReader) Read(b []byte) (read int, err error) { 575 log.Debug("lazychunkreader.read", "key", r.key) 576 metrics.GetOrRegisterCounter("lazychunkreader.read", nil).Inc(1) 577 578 read, err = r.ReadAt(b, r.off) 579 if err != nil && err != io.EOF { 580 log.Error("lazychunkreader.readat", "read", read, "err", err) 581 metrics.GetOrRegisterCounter("lazychunkreader.read.err", nil).Inc(1) 582 } 583 584 metrics.GetOrRegisterCounter("lazychunkreader.read.bytes", nil).Inc(int64(read)) 585 586 r.off += int64(read) 587 return 588 } 589 590 // completely analogous to standard SectionReader implementation 591 var errWhence = errors.New("Seek: invalid whence") 592 var errOffset = errors.New("Seek: invalid offset") 593 594 func (r *LazyChunkReader) Seek(offset int64, whence int) (int64, error) { 595 log.Debug("lazychunkreader.seek", "key", r.key, "offset", offset) 596 switch whence { 597 default: 598 return 0, errWhence 599 case 0: 600 offset += 0 601 case 1: 602 offset += r.off 603 case 2: 604 if r.chunkData == nil { //seek from the end requires rootchunk for size. call Size first 605 _, err := r.Size(context.TODO(), nil) 606 if err != nil { 607 return 0, fmt.Errorf("can't get size: %v", err) 608 } 609 } 610 offset += r.chunkData.Size() 611 } 612 613 if offset < 0 { 614 return 0, errOffset 615 } 616 r.off = offset 617 return offset, nil 618 }