github.com/codingfuture/orig-energi3@v0.8.4/swarm/storage/chunker.go

// Copyright 2018 The Energi Core Authors
// Copyright 2016 The go-ethereum Authors
// This file is part of the Energi Core library.
//
// The Energi Core library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The Energi Core library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the Energi Core library. If not, see <http://www.gnu.org/licenses/>.

package storage

import (
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"sync"
	"time"

	"github.com/ethereum/go-ethereum/metrics"
	ch "github.com/ethereum/go-ethereum/swarm/chunk"
	"github.com/ethereum/go-ethereum/swarm/log"
	"github.com/ethereum/go-ethereum/swarm/spancontext"
	opentracing "github.com/opentracing/opentracing-go"
	olog "github.com/opentracing/opentracing-go/log"
)

/*
The distributed storage implemented in this package requires fixed-size chunks of content.

Chunker is the interface to a component that is responsible for disassembling and assembling larger data.

TreeChunker implements a Chunker based on a tree structure defined as follows:

1 each node in the tree, including the root and other branching nodes, is stored as a chunk.

2 branching nodes encode data content that includes the size of the data slice covered by the entire subtree under the node, as well as the hash keys of all its children:
data_{i} := size(subtree_{i}) || key_{j} || key_{j+1} .... || key_{j+n-1}

3 leaf nodes encode an actual subslice of the input data.

4 if the data size is not more than the maximum chunk size, the data is stored in a single chunk:
  key = hash(int64(size) + data)

5 if the data size is more than chunkSize*branches^l, but no more than chunkSize*branches^(l+1), the data vector is split into slices of chunkSize*branches^l length (except the last one):
  key = hash(int64(size) + key(slice0) + key(slice1) + ...)

The underlying hash function is configurable.
*/
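// As a worked example of rule 5 above (illustrative numbers only, assuming the
// defaults used by TreeSplit below: a 4096-byte chunk size and a 32-byte hash,
// giving branches = 4096/32 = 128): a 1 MiB document is larger than
// chunkSize*branches^1 = 512 KiB but no larger than chunkSize*branches^2 = 64 MiB,
// so it is split into 512 KiB slices and encoded as a tree of depth 2.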
/*
Tree chunker is a concrete implementation of data chunking.
This chunker works in a simple way: it builds a tree out of the document so that each node either represents a chunk of real data or a branching non-leaf node of the tree. In particular, each such non-leaf chunk is a concatenation of the hashes of its respective children. This scheme simultaneously guarantees data integrity as well as self addressing. Abstract nodes are transparent, since the size component they represent is strictly greater than their maximum data size, as they encode a subtree.

If all is well, it is possible to implement this by simply composing readers, so that no extra allocation or buffering is necessary for the data splitting and joining. This means that in principle there can be direct IO between memory, file system and network socket (a bzz peer's storage request is read from the socket). In practice there may be a need for several stages of internal buffering.
The hashing itself does use extra copies and allocation, though, since it genuinely needs them.
*/

type ChunkerParams struct {
	chunkSize int64
	hashSize  int64
}

type SplitterParams struct {
	ChunkerParams
	reader io.Reader
	putter Putter
	addr   Address
}

type TreeSplitterParams struct {
	SplitterParams
	size int64
}

type JoinerParams struct {
	ChunkerParams
	addr   Address
	getter Getter
	// TODO: there is a bug, so depth can only be 0 today, see: https://github.com/ethersphere/go-ethereum/issues/344
	depth int
	ctx   context.Context
}

type TreeChunker struct {
	ctx context.Context

	branches int64
	dataSize int64
	data     io.Reader
	// calculated
	addr        Address
	depth       int
	hashSize    int64        // self.hashFunc.New().Size()
	chunkSize   int64        // hashSize * branches
	workerCount int64        // the number of worker routines used
	workerLock  sync.RWMutex // lock for the worker count
	jobC        chan *hashJob
	wg          *sync.WaitGroup
	putter      Putter
	getter      Getter
	errC        chan error
	quitC       chan bool
}

/*
Join reconstructs the original content based on a root key.
When joining, the caller is returned a LazyChunkReader, which is seekable and
implements on-demand fetching of chunks as and where they are read.
New chunks to retrieve come from the getter, which the caller provides.
If an error is encountered during joining, it appears as a reader error.
As a result, partial reads from a document are possible even if other parts
are corrupt or lost.
The chunks are not meant to be validated by the chunker when joining; this
is because it is left to the DPA to decide which sources are trusted.
*/
func TreeJoin(ctx context.Context, addr Address, getter Getter, depth int) *LazyChunkReader {
	jp := &JoinerParams{
		ChunkerParams: ChunkerParams{
			chunkSize: ch.DefaultSize,
			hashSize:  int64(len(addr)),
		},
		addr:   addr,
		getter: getter,
		depth:  depth,
		ctx:    ctx,
	}

	return NewTreeJoiner(jp).Join(ctx)
}
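// The sketch below is illustrative only and not part of the original file: it
// shows how a caller might combine TreeJoin with the returned LazyChunkReader
// to reconstruct a whole document. The function name and signature are
// hypothetical; any Getter implementation can be passed in.
func exampleJoinAndRead(ctx context.Context, addr Address, getter Getter) ([]byte, error) {
	reader := TreeJoin(ctx, addr, getter, 0) // depth must be 0 today, see JoinerParams
	// Size fetches the root chunk and decodes the total document length.
	size, err := reader.Size(ctx, nil)
	if err != nil {
		return nil, err
	}
	buf := make([]byte, size)
	// ReadAt fetches only the chunks covering the requested range; a read
	// that reaches the end of the document returns io.EOF.
	if _, err := reader.ReadAt(buf, 0); err != nil && err != io.EOF {
		return nil, err
	}
	return buf, nil
}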
/*
When splitting, data is given as a SectionReader, and the key is a hashSize-long byte slice (Key); the root hash of the entire content will fill this once processing finishes.
New chunks to store are stored using the putter, which the caller provides.
*/
func TreeSplit(ctx context.Context, data io.Reader, size int64, putter Putter) (k Address, wait func(context.Context) error, err error) {
	tsp := &TreeSplitterParams{
		SplitterParams: SplitterParams{
			ChunkerParams: ChunkerParams{
				chunkSize: ch.DefaultSize,
				hashSize:  putter.RefSize(),
			},
			reader: data,
			putter: putter,
		},
		size: size,
	}
	return NewTreeSplitter(tsp).Split(ctx)
}

func NewTreeJoiner(params *JoinerParams) *TreeChunker {
	tc := &TreeChunker{}
	tc.hashSize = params.hashSize
	tc.branches = params.chunkSize / params.hashSize
	tc.addr = params.addr
	tc.getter = params.getter
	tc.depth = params.depth
	tc.chunkSize = params.chunkSize
	tc.workerCount = 0
	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
	tc.wg = &sync.WaitGroup{}
	tc.errC = make(chan error)
	tc.quitC = make(chan bool)

	tc.ctx = params.ctx

	return tc
}

func NewTreeSplitter(params *TreeSplitterParams) *TreeChunker {
	tc := &TreeChunker{}
	tc.data = params.reader
	tc.dataSize = params.size
	tc.hashSize = params.hashSize
	tc.branches = params.chunkSize / params.hashSize
	tc.addr = params.addr
	tc.chunkSize = params.chunkSize
	tc.putter = params.putter
	tc.workerCount = 0
	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
	tc.wg = &sync.WaitGroup{}
	tc.errC = make(chan error)
	tc.quitC = make(chan bool)

	return tc
}

type hashJob struct {
	key      Address
	chunk    []byte
	size     int64
	parentWg *sync.WaitGroup
}

func (tc *TreeChunker) incrementWorkerCount() {
	tc.workerLock.Lock()
	defer tc.workerLock.Unlock()
	tc.workerCount++
}

func (tc *TreeChunker) getWorkerCount() int64 {
	tc.workerLock.RLock()
	defer tc.workerLock.RUnlock()
	return tc.workerCount
}

func (tc *TreeChunker) decrementWorkerCount() {
	tc.workerLock.Lock()
	defer tc.workerLock.Unlock()
	tc.workerCount--
}
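// The sketch below is illustrative only and not part of the original file: it
// shows the intended calling sequence for TreeSplit. The function name is
// hypothetical; any Putter implementation can be passed in. Note that the
// returned key is available immediately, but the caller must invoke the
// returned wait function before relying on the chunks having been stored.
func exampleSplitAndStore(ctx context.Context, data io.Reader, size int64, putter Putter) (Address, error) {
	key, wait, err := TreeSplit(ctx, data, size, putter)
	if err != nil {
		return nil, err
	}
	// wait blocks until the putter has finished persisting all chunks.
	if err := wait(ctx); err != nil {
		return nil, err
	}
	return key, nil
}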
func (tc *TreeChunker) Split(ctx context.Context) (k Address, wait func(context.Context) error, err error) {
	if tc.chunkSize <= 0 {
		panic("chunker must be initialised")
	}

	tc.runWorker(ctx)

	depth := 0
	treeSize := tc.chunkSize

	// take the lowest depth such that chunkSize*branches^depth >= size
	// (power series: the order of magnitude of the data size in base branches, i.e. the number of levels of branching in the resulting tree)
	for ; treeSize < tc.dataSize; treeSize *= tc.branches {
		depth++
	}

	key := make([]byte, tc.hashSize)
	// this waitgroup member is released after the root hash is calculated
	tc.wg.Add(1)
	// launch the actual recursive function, passing the waitgroups
	go tc.split(ctx, depth, treeSize/tc.branches, key, tc.dataSize, tc.wg)

	// closes the internal error channel once all subprocesses in the workgroup have finished
	go func() {
		// wait for all the goroutines to finish
		tc.wg.Wait()
		close(tc.errC)
	}()

	defer close(tc.quitC)
	defer tc.putter.Close()
	select {
	case err := <-tc.errC:
		if err != nil {
			return nil, nil, err
		}
	case <-ctx.Done():
		return nil, nil, ctx.Err()
	}

	return key, tc.putter.Wait, nil
}

func (tc *TreeChunker) split(ctx context.Context, depth int, treeSize int64, addr Address, size int64, parentWg *sync.WaitGroup) {

	for depth > 0 && size < treeSize {
		treeSize /= tc.branches
		depth--
	}

	if depth == 0 {
		// leaf node -> content chunk
		chunkData := make([]byte, size+8)
		binary.LittleEndian.PutUint64(chunkData[0:8], uint64(size))
		var readBytes int64
		for readBytes < size {
			n, err := tc.data.Read(chunkData[8+readBytes:])
			readBytes += int64(n)
			if err != nil && !(err == io.EOF && readBytes == size) {
				tc.errC <- err
				return
			}
		}
		select {
		case tc.jobC <- &hashJob{addr, chunkData, size, parentWg}:
		case <-tc.quitC:
		}
		return
	}
	// depth > 0
	// intermediate chunk containing the hashes of its child nodes
	branchCnt := (size + treeSize - 1) / treeSize

	var chunk = make([]byte, branchCnt*tc.hashSize+8)
	var pos, i int64

	binary.LittleEndian.PutUint64(chunk[0:8], uint64(size))

	childrenWg := &sync.WaitGroup{}
	var secSize int64
	for i < branchCnt {
		// the last item can have shorter data
		if size-pos < treeSize {
			secSize = size - pos
		} else {
			secSize = treeSize
		}
		// the hash of that data
		subTreeAddress := chunk[8+i*tc.hashSize : 8+(i+1)*tc.hashSize]

		childrenWg.Add(1)
		tc.split(ctx, depth-1, treeSize/tc.branches, subTreeAddress, secSize, childrenWg)

		i++
		pos += treeSize
	}
	// wait for all the children to complete calculating their hashes and copying them onto sections of the chunk
	childrenWg.Wait()

	worker := tc.getWorkerCount()
	if int64(len(tc.jobC)) > worker && worker < ChunkProcessors {
		tc.runWorker(ctx)
	}
	select {
	case tc.jobC <- &hashJob{addr, chunk, size, parentWg}:
	case <-tc.quitC:
	}
}

func (tc *TreeChunker) runWorker(ctx context.Context) {
	tc.incrementWorkerCount()
	go func() {
		defer tc.decrementWorkerCount()
		for {
			select {

			case job, ok := <-tc.jobC:
				if !ok {
					return
				}

				h, err := tc.putter.Put(ctx, job.chunk)
				if err != nil {
					tc.errC <- err
					return
				}
				copy(job.key, h)
				job.parentWg.Done()
			case <-tc.quitC:
				return
			}
		}
	}()
}
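// The helper below is an illustrative sketch, not part of the original file:
// it mirrors the power-series loop at the top of Split so the depth
// calculation can be seen in isolation. With the package defaults
// (chunkSize 4096, branches 128), dataSize = 1 MiB yields depth 2 and a
// top-level treeSize of 64 MiB; Split then hands treeSize/branches = 512 KiB
// to the recursive split call.
func exampleTreeDepth(dataSize, chunkSize, branches int64) (depth int, treeSize int64) {
	treeSize = chunkSize
	for ; treeSize < dataSize; treeSize *= branches {
		depth++
	}
	return depth, treeSize
}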
// LazyChunkReader implements LazySectionReader
type LazyChunkReader struct {
	ctx       context.Context
	addr      Address // root address
	chunkData ChunkData
	off       int64 // offset
	chunkSize int64 // inherited from chunker
	branches  int64 // inherited from chunker
	hashSize  int64 // inherited from chunker
	depth     int
	getter    Getter
}

func (tc *TreeChunker) Join(ctx context.Context) *LazyChunkReader {
	return &LazyChunkReader{
		addr:      tc.addr,
		chunkSize: tc.chunkSize,
		branches:  tc.branches,
		hashSize:  tc.hashSize,
		depth:     tc.depth,
		getter:    tc.getter,
		ctx:       tc.ctx,
	}
}

func (r *LazyChunkReader) Context() context.Context {
	return r.ctx
}

// Size is meant to be called on the LazySectionReader
func (r *LazyChunkReader) Size(ctx context.Context, quitC chan bool) (n int64, err error) {
	metrics.GetOrRegisterCounter("lazychunkreader.size", nil).Inc(1)

	var sp opentracing.Span
	var cctx context.Context
	cctx, sp = spancontext.StartSpan(
		ctx,
		"lcr.size")
	defer sp.Finish()

	log.Debug("lazychunkreader.size", "addr", r.addr)
	if r.chunkData == nil {
		startTime := time.Now()
		chunkData, err := r.getter.Get(cctx, Reference(r.addr))
		if err != nil {
			metrics.GetOrRegisterResettingTimer("lcr.getter.get.err", nil).UpdateSince(startTime)
			return 0, err
		}
		metrics.GetOrRegisterResettingTimer("lcr.getter.get", nil).UpdateSince(startTime)
		r.chunkData = chunkData
	}

	s := r.chunkData.Size()
	log.Debug("lazychunkreader.size", "key", r.addr, "size", s)

	return int64(s), nil
}

// ReadAt can be called numerous times.
// Concurrent reads are allowed.
// Size() needs to be called synchronously on the LazyChunkReader first.
func (r *LazyChunkReader) ReadAt(b []byte, off int64) (read int, err error) {
	metrics.GetOrRegisterCounter("lazychunkreader.readat", nil).Inc(1)

	var sp opentracing.Span
	var cctx context.Context
	cctx, sp = spancontext.StartSpan(
		r.ctx,
		"lcr.read")
	defer sp.Finish()

	defer func() {
		sp.LogFields(
			olog.Int("off", int(off)),
			olog.Int("read", read))
	}()

	// this is correct: a swarm doc cannot be zero length, so no EOF is expected
	if len(b) == 0 {
		return 0, nil
	}
	quitC := make(chan bool)
	size, err := r.Size(cctx, quitC)
	if err != nil {
		log.Debug("lazychunkreader.readat.size", "size", size, "err", err)
		return 0, err
	}

	errC := make(chan error)

	var treeSize int64
	var depth int
	// calculate depth and max treeSize
	treeSize = r.chunkSize
	for ; treeSize < size; treeSize *= r.branches {
		depth++
	}
	wg := sync.WaitGroup{}
	length := int64(len(b))
	for d := 0; d < r.depth; d++ {
		off *= r.chunkSize
		length *= r.chunkSize
	}
	wg.Add(1)
	go r.join(cctx, b, off, off+length, depth, treeSize/r.branches, r.chunkData, &wg, errC, quitC)
	go func() {
		wg.Wait()
		close(errC)
	}()

	err = <-errC
	if err != nil {
		log.Debug("lazychunkreader.readat.errc", "err", err)
		close(quitC)
		return 0, err
	}
	if off+int64(len(b)) >= size {
		log.Debug("lazychunkreader.readat.return at end", "size", size, "off", off)
		return int(size - off), io.EOF
	}
	log.Debug("lazychunkreader.readat.errc", "buff", len(b))
	return len(b), nil
}
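// The sketch below is illustrative only and not part of the original file: it
// shows a partial read from the middle of a document. Because join fetches
// chunks lazily, only the subtrees covering [off, off+length) are retrieved,
// which is what makes partial reads of a partly damaged document possible.
// The function name is hypothetical.
func exampleReadRange(r *LazyChunkReader, off, length int64) ([]byte, error) {
	buf := make([]byte, length)
	n, err := r.ReadAt(buf, off)
	// io.EOF here only signals that the read reached the end of the document;
	// the first n bytes are still valid.
	if err != nil && err != io.EOF {
		return nil, err
	}
	return buf[:n], nil
}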
func (r *LazyChunkReader) join(ctx context.Context, b []byte, off int64, eoff int64, depth int, treeSize int64, chunkData ChunkData, parentWg *sync.WaitGroup, errC chan error, quitC chan bool) {
	defer parentWg.Done()
	// find the appropriate block level
	for chunkData.Size() < uint64(treeSize) && depth > r.depth {
		treeSize /= r.branches
		depth--
	}

	// leaf chunk found
	if depth == r.depth {
		extra := 8 + eoff - int64(len(chunkData))
		if extra > 0 {
			eoff -= extra
		}
		copy(b, chunkData[8+off:8+eoff])
		return // content chunk: the requested slice has been copied into b
	}

	// subtree
	start := off / treeSize
	end := (eoff + treeSize - 1) / treeSize

	// the last non-leaf chunk can be shorter than the default chunk size; let's not read it further than its end
	currentBranches := int64(len(chunkData)-8) / r.hashSize
	if end > currentBranches {
		end = currentBranches
	}

	wg := &sync.WaitGroup{}
	defer wg.Wait()
	for i := start; i < end; i++ {
		soff := i * treeSize
		roff := soff
		seoff := soff + treeSize

		if soff < off {
			soff = off
		}
		if seoff > eoff {
			seoff = eoff
		}
		if depth > 1 {
			wg.Wait()
		}
		wg.Add(1)
		go func(j int64) {
			childAddress := chunkData[8+j*r.hashSize : 8+(j+1)*r.hashSize]
			startTime := time.Now()
			chunkData, err := r.getter.Get(ctx, Reference(childAddress))
			if err != nil {
				metrics.GetOrRegisterResettingTimer("lcr.getter.get.err", nil).UpdateSince(startTime)
				log.Debug("lazychunkreader.join", "key", fmt.Sprintf("%x", childAddress), "err", err)
				select {
				case errC <- fmt.Errorf("chunk %v-%v not found; key: %s", off, off+treeSize, fmt.Sprintf("%x", childAddress)):
				case <-quitC:
				}
				return
			}
			metrics.GetOrRegisterResettingTimer("lcr.getter.get", nil).UpdateSince(startTime)
			if l := len(chunkData); l < 9 {
				select {
				case errC <- fmt.Errorf("chunk %v-%v incomplete; key: %s, data length %v", off, off+treeSize, fmt.Sprintf("%x", childAddress), l):
				case <-quitC:
				}
				return
			}
			if soff < off {
				soff = off
			}
			r.join(ctx, b[soff-off:seoff-off], soff-roff, seoff-roff, depth-1, treeSize/r.branches, chunkData, wg, errC, quitC)
		}(i)
	}
}

// Read keeps a cursor, so it cannot be called simultaneously; see ReadAt
func (r *LazyChunkReader) Read(b []byte) (read int, err error) {
	log.Debug("lazychunkreader.read", "key", r.addr)
	metrics.GetOrRegisterCounter("lazychunkreader.read", nil).Inc(1)

	read, err = r.ReadAt(b, r.off)
	if err != nil && err != io.EOF {
		log.Debug("lazychunkreader.readat", "read", read, "err", err)
		metrics.GetOrRegisterCounter("lazychunkreader.read.err", nil).Inc(1)
	}

	metrics.GetOrRegisterCounter("lazychunkreader.read.bytes", nil).Inc(int64(read))

	r.off += int64(read)
	return read, err
}

// completely analogous to the standard SectionReader implementation
var errWhence = errors.New("Seek: invalid whence")
var errOffset = errors.New("Seek: invalid offset")

func (r *LazyChunkReader) Seek(offset int64, whence int) (int64, error) {
	cctx, sp := spancontext.StartSpan(
		r.ctx,
		"lcr.seek")
	defer sp.Finish()

	log.Debug("lazychunkreader.seek", "key", r.addr, "offset", offset)
	switch whence {
	default:
		return 0, errWhence
	case 0:
		offset += 0
	case 1:
		offset += r.off
	case 2:
		if r.chunkData == nil { // seeking from the end requires the root chunk for the size; call Size first
			_, err := r.Size(cctx, nil)
			if err != nil {
				return 0, fmt.Errorf("can't get size: %v", err)
			}
		}
		offset += int64(r.chunkData.Size())
	}

	if offset < 0 {
		return 0, errOffset
	}
	r.off = offset
	return offset, nil
}
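// The sketch below is illustrative only and not part of the original file: it
// demonstrates the cursor semantics of Seek and Read. whence follows the
// standard io.Seeker convention (0 = from start, 1 = from current offset,
// 2 = from end); seeking from the end triggers a Size fetch of the root chunk
// if it has not been retrieved yet. The function name is hypothetical.
func exampleSeekAndRead(r *LazyChunkReader, buf []byte) (int, error) {
	// position the cursor len(buf) bytes before the end of the document
	if _, err := r.Seek(-int64(len(buf)), 2); err != nil {
		return 0, err
	}
	// Read advances the cursor; it returns io.EOF when the read reaches the
	// end of the document, which is expected here.
	n, err := r.Read(buf)
	if err == io.EOF {
		err = nil
	}
	return n, err
}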