github.com/xxRanger/go-ethereum@v1.8.23/swarm/storage/chunker.go

// Copyright 2016 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

package storage

import (
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"sync"
	"time"

	"github.com/ethereum/go-ethereum/metrics"
	ch "github.com/ethereum/go-ethereum/swarm/chunk"
	"github.com/ethereum/go-ethereum/swarm/log"
	"github.com/ethereum/go-ethereum/swarm/spancontext"
	opentracing "github.com/opentracing/opentracing-go"
	olog "github.com/opentracing/opentracing-go/log"
)

/*
The distributed storage implemented in this package requires fixed-size chunks of content.

Chunker is the interface to a component that is responsible for disassembling and assembling larger data.

TreeChunker implements a Chunker based on a tree structure defined as follows:

1 each node in the tree, including the root and other branching nodes, is stored as a chunk.

2 branching nodes encode data that includes the size of the data slice covered by the entire subtree under the node, as well as the hash keys of all its children:
data_{i} := size(subtree_{i}) || key_{j} || key_{j+1} .... || key_{j+n-1}

3 leaf nodes encode an actual subslice of the input data.

4 if the data size is not more than the maximum chunk size, the data is stored in a single chunk
key = hash(int64(size) + data)

5 if the data size is more than chunksize*branches^l, but no more than chunksize*branches^(l+1), the data vector is split into slices of chunksize*branches^l length (except the last one).
key = hash(int64(size) + key(slice0) + key(slice1) + ...)

The underlying hash function is configurable.
*/

/*
Tree chunker is a concrete implementation of data chunking.
This chunker works in a simple way: it builds a tree out of the document so that each node either represents a chunk of real data or a chunk representing a branching non-leaf node of the tree. In particular, each such non-leaf chunk is a concatenation of the hashes of its respective children. This scheme simultaneously guarantees data integrity as well as self-addressing. Abstract nodes are transparent: since they encode a subtree, their represented size component is strictly greater than the maximum data size of a single chunk.

If all is well, it is possible to implement this by simply composing readers, so that no extra allocation or buffering is necessary for the data splitting and joining. This means that in principle there can be direct IO between memory, file system and network socket (a bzz peer's storage request is read directly from the socket). In practice there may be a need for several stages of internal buffering.
The hashing itself does use extra copies and allocation, though, since it needs the full chunk data in a single buffer.
*/
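// A minimal sketch of the chunk layout described above: every chunk starts
// with an 8-byte little-endian length prefix covering its whole subtree,
// followed either by raw data (leaf) or by the concatenated child hashes
// (branching node). The helper name and the 32-byte hash size here are
// assumptions for illustration only.
//
//	func encodeBranchingChunk(subtreeSize uint64, childHashes [][]byte) []byte {
//		const hashSize = 32 // assumed; in practice tc.hashSize
//		chunk := make([]byte, 8, 8+len(childHashes)*hashSize)
//		binary.LittleEndian.PutUint64(chunk[0:8], subtreeSize) // size of the data slice covered by the subtree
//		for _, h := range childHashes {
//			chunk = append(chunk, h...) // key_{j} || key_{j+1} || ...
//		}
//		return chunk
//	}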
type ChunkerParams struct {
	chunkSize int64
	hashSize  int64
}

type SplitterParams struct {
	ChunkerParams
	reader io.Reader
	putter Putter
	addr   Address
}

type TreeSplitterParams struct {
	SplitterParams
	size int64
}

type JoinerParams struct {
	ChunkerParams
	addr   Address
	getter Getter
	// TODO: there is a bug, so depth can only be 0 today, see: https://github.com/ethersphere/go-ethereum/issues/344
	depth int
	ctx   context.Context
}

type TreeChunker struct {
	ctx context.Context

	branches int64
	dataSize int64
	data     io.Reader
	// calculated
	addr        Address
	depth       int
	hashSize    int64        // self.hashFunc.New().Size()
	chunkSize   int64        // hashSize * branches
	workerCount int64        // the number of worker routines used
	workerLock  sync.RWMutex // lock for the worker count
	jobC        chan *hashJob
	wg          *sync.WaitGroup
	putter      Putter
	getter      Getter
	errC        chan error
	quitC       chan bool
}

/*
Join reconstructs the original content based on a root key.
When joining, the caller is returned a lazy SectionReader, which is
seekable and implements on-demand fetching of chunks as and where it is read.
New chunks to retrieve come from the getter, which the caller provides.
If an error is encountered during joining, it appears as a reader error;
as a result, partial reads from a document are possible even if other parts
are corrupt or lost.
The chunks are not meant to be validated by the chunker when joining. This
is because it is left to the DPA to decide which sources are trusted.
*/
func TreeJoin(ctx context.Context, addr Address, getter Getter, depth int) *LazyChunkReader {
	jp := &JoinerParams{
		ChunkerParams: ChunkerParams{
			chunkSize: ch.DefaultSize,
			hashSize:  int64(len(addr)),
		},
		addr:   addr,
		getter: getter,
		depth:  depth,
		ctx:    ctx,
	}

	return NewTreeJoiner(jp).Join(ctx)
}
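// A minimal usage sketch for TreeJoin, assuming some Getter implementation
// (e.g. a hasherStore backed by a local store); the helper name readDocument
// is hypothetical.
//
//	func readDocument(ctx context.Context, rootAddr Address, getter Getter) ([]byte, error) {
//		reader := TreeJoin(ctx, rootAddr, getter, 0) // depth must be 0 today, see the TODO above
//		size, err := reader.Size(ctx, nil)           // Size must be called before ReadAt
//		if err != nil {
//			return nil, err
//		}
//		buf := make([]byte, size)
//		if _, err := reader.ReadAt(buf, 0); err != nil && err != io.EOF {
//			return nil, err
//		}
//		return buf, nil
//	}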
/*
When splitting, data is given as a SectionReader, and the key is a hashSize-long byte slice (Key); the root hash of the entire content will fill this once processing finishes.
New chunks to store are stored using the putter, which the caller provides.
*/
func TreeSplit(ctx context.Context, data io.Reader, size int64, putter Putter) (k Address, wait func(context.Context) error, err error) {
	tsp := &TreeSplitterParams{
		SplitterParams: SplitterParams{
			ChunkerParams: ChunkerParams{
				chunkSize: ch.DefaultSize,
				hashSize:  putter.RefSize(),
			},
			reader: data,
			putter: putter,
		},
		size: size,
	}
	return NewTreeSplitter(tsp).Split(ctx)
}

func NewTreeJoiner(params *JoinerParams) *TreeChunker {
	tc := &TreeChunker{}
	tc.hashSize = params.hashSize
	tc.branches = params.chunkSize / params.hashSize
	tc.addr = params.addr
	tc.getter = params.getter
	tc.depth = params.depth
	tc.chunkSize = params.chunkSize
	tc.workerCount = 0
	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
	tc.wg = &sync.WaitGroup{}
	tc.errC = make(chan error)
	tc.quitC = make(chan bool)

	tc.ctx = params.ctx

	return tc
}

func NewTreeSplitter(params *TreeSplitterParams) *TreeChunker {
	tc := &TreeChunker{}
	tc.data = params.reader
	tc.dataSize = params.size
	tc.hashSize = params.hashSize
	tc.branches = params.chunkSize / params.hashSize
	tc.addr = params.addr
	tc.chunkSize = params.chunkSize
	tc.putter = params.putter
	tc.workerCount = 0
	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
	tc.wg = &sync.WaitGroup{}
	tc.errC = make(chan error)
	tc.quitC = make(chan bool)

	return tc
}

type hashJob struct {
	key      Address
	chunk    []byte
	size     int64
	parentWg *sync.WaitGroup
}

func (tc *TreeChunker) incrementWorkerCount() {
	tc.workerLock.Lock()
	defer tc.workerLock.Unlock()
	tc.workerCount += 1
}

func (tc *TreeChunker) getWorkerCount() int64 {
	tc.workerLock.RLock()
	defer tc.workerLock.RUnlock()
	return tc.workerCount
}

func (tc *TreeChunker) decrementWorkerCount() {
	tc.workerLock.Lock()
	defer tc.workerLock.Unlock()
	tc.workerCount -= 1
}
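// A minimal usage sketch for TreeSplit, assuming some Putter implementation
// (e.g. a hasherStore); the helper name storeDocument is hypothetical.
//
//	func storeDocument(ctx context.Context, data io.Reader, size int64, putter Putter) (Address, error) {
//		key, wait, err := TreeSplit(ctx, data, size, putter)
//		if err != nil {
//			return nil, err
//		}
//		// wait blocks until all chunks have actually been persisted
//		if err := wait(ctx); err != nil {
//			return nil, err
//		}
//		return key, nil
//	}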
func (tc *TreeChunker) Split(ctx context.Context) (k Address, wait func(context.Context) error, err error) {
	if tc.chunkSize <= 0 {
		panic("chunker must be initialised")
	}

	tc.runWorker(ctx)

	depth := 0
	treeSize := tc.chunkSize

	// takes the lowest depth such that chunksize*HashCount^(depth+1) > size;
	// a power series that finds the order of magnitude of the data size in base hashCount,
	// i.e. the number of levels of branching in the resulting tree
	for ; treeSize < tc.dataSize; treeSize *= tc.branches {
		depth++
	}

	key := make([]byte, tc.hashSize)
	// this waitgroup member is released after the root hash is calculated
	tc.wg.Add(1)
	// launch the actual recursive function, passing the waitgroups
	go tc.split(ctx, depth, treeSize/tc.branches, key, tc.dataSize, tc.wg)

	// closes the internal error channel once all subprocesses in the workgroup have finished
	go func() {
		// waiting for all threads to finish
		tc.wg.Wait()
		close(tc.errC)
	}()

	defer close(tc.quitC)
	defer tc.putter.Close()
	select {
	case err := <-tc.errC:
		if err != nil {
			return nil, nil, err
		}
	case <-ctx.Done():
		return nil, nil, ctx.Err()
	}

	return key, tc.putter.Wait, nil
}

func (tc *TreeChunker) split(ctx context.Context, depth int, treeSize int64, addr Address, size int64, parentWg *sync.WaitGroup) {

	for depth > 0 && size < treeSize {
		treeSize /= tc.branches
		depth--
	}

	if depth == 0 {
		// leaf nodes -> content chunks
		chunkData := make([]byte, size+8)
		binary.LittleEndian.PutUint64(chunkData[0:8], uint64(size))
		var readBytes int64
		for readBytes < size {
			n, err := tc.data.Read(chunkData[8+readBytes:])
			readBytes += int64(n)
			if err != nil && !(err == io.EOF && readBytes == size) {
				tc.errC <- err
				return
			}
		}
		select {
		case tc.jobC <- &hashJob{addr, chunkData, size, parentWg}:
		case <-tc.quitC:
		}
		return
	}
	// depth > 0
	// intermediate chunk containing the hashes of the child nodes
	branchCnt := (size + treeSize - 1) / treeSize

	var chunk = make([]byte, branchCnt*tc.hashSize+8)
	var pos, i int64

	binary.LittleEndian.PutUint64(chunk[0:8], uint64(size))

	childrenWg := &sync.WaitGroup{}
	var secSize int64
	for i < branchCnt {
		// the last item can have shorter data
		if size-pos < treeSize {
			secSize = size - pos
		} else {
			secSize = treeSize
		}
		// the hash of that data
		subTreeAddress := chunk[8+i*tc.hashSize : 8+(i+1)*tc.hashSize]

		childrenWg.Add(1)
		tc.split(ctx, depth-1, treeSize/tc.branches, subTreeAddress, secSize, childrenWg)

		i++
		pos += treeSize
	}
	// wait for all the children to complete calculating their hashes and copying them onto sections of the chunk
	childrenWg.Wait()

	worker := tc.getWorkerCount()
	if int64(len(tc.jobC)) > worker && worker < ChunkProcessors {
		tc.runWorker(ctx)
	}
	select {
	case tc.jobC <- &hashJob{addr, chunk, size, parentWg}:
	case <-tc.quitC:
	}
}

func (tc *TreeChunker) runWorker(ctx context.Context) {
	tc.incrementWorkerCount()
	go func() {
		defer tc.decrementWorkerCount()
		for {
			select {
			case job, ok := <-tc.jobC:
				if !ok {
					return
				}

				h, err := tc.putter.Put(ctx, job.chunk)
				if err != nil {
					tc.errC <- err
					return
				}
				copy(job.key, h)
				job.parentWg.Done()
			case <-tc.quitC:
				return
			}
		}
	}()
}
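// A worked example of the depth calculation in Split above, assuming the
// default 4096-byte chunk size and 32-byte hashes (so branches = 128):
//
//	size <= 4096          -> depth 0, a single content chunk
//	size <= 4096*128      -> depth 1, one root chunk of up to 128 child hashes
//	size <= 4096*128*128  -> depth 2, and so on
//
// After the loop, treeSize is the span covered by the whole tree, so Split
// passes treeSize/branches (the span covered by one child of the root) to
// the recursive split call.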
// LazyChunkReader implements LazySectionReader
type LazyChunkReader struct {
	ctx       context.Context
	addr      Address // root address
	chunkData ChunkData
	off       int64 // offset
	chunkSize int64 // inherited from chunker
	branches  int64 // inherited from chunker
	hashSize  int64 // inherited from chunker
	depth     int
	getter    Getter
}

func (tc *TreeChunker) Join(ctx context.Context) *LazyChunkReader {
	return &LazyChunkReader{
		addr:      tc.addr,
		chunkSize: tc.chunkSize,
		branches:  tc.branches,
		hashSize:  tc.hashSize,
		depth:     tc.depth,
		getter:    tc.getter,
		ctx:       tc.ctx,
	}
}

func (r *LazyChunkReader) Context() context.Context {
	return r.ctx
}

// Size is meant to be called on the LazySectionReader
func (r *LazyChunkReader) Size(ctx context.Context, quitC chan bool) (n int64, err error) {
	metrics.GetOrRegisterCounter("lazychunkreader.size", nil).Inc(1)

	var sp opentracing.Span
	var cctx context.Context
	cctx, sp = spancontext.StartSpan(
		ctx,
		"lcr.size")
	defer sp.Finish()

	log.Debug("lazychunkreader.size", "addr", r.addr)
	if r.chunkData == nil {
		startTime := time.Now()
		chunkData, err := r.getter.Get(cctx, Reference(r.addr))
		if err != nil {
			metrics.GetOrRegisterResettingTimer("lcr.getter.get.err", nil).UpdateSince(startTime)
			return 0, err
		}
		metrics.GetOrRegisterResettingTimer("lcr.getter.get", nil).UpdateSince(startTime)
		r.chunkData = chunkData
	}

	s := r.chunkData.Size()
	log.Debug("lazychunkreader.size", "key", r.addr, "size", s)

	return int64(s), nil
}

// ReadAt can be called numerous times.
// Concurrent reads are allowed.
// Size() needs to be called synchronously on the LazyChunkReader first.
func (r *LazyChunkReader) ReadAt(b []byte, off int64) (read int, err error) {
	metrics.GetOrRegisterCounter("lazychunkreader.readat", nil).Inc(1)

	var sp opentracing.Span
	var cctx context.Context
	cctx, sp = spancontext.StartSpan(
		r.ctx,
		"lcr.read")
	defer sp.Finish()

	defer func() {
		sp.LogFields(
			olog.Int("off", int(off)),
			olog.Int("read", read))
	}()

	// this is correct: a swarm doc cannot be zero length, so no EOF is expected
	if len(b) == 0 {
		return 0, nil
	}
	quitC := make(chan bool)
	size, err := r.Size(cctx, quitC)
	if err != nil {
		log.Debug("lazychunkreader.readat.size", "size", size, "err", err)
		return 0, err
	}

	errC := make(chan error)

	var treeSize int64
	var depth int
	// calculate depth and max treeSize
	treeSize = r.chunkSize
	for ; treeSize < size; treeSize *= r.branches {
		depth++
	}
	wg := sync.WaitGroup{}
	length := int64(len(b))
	for d := 0; d < r.depth; d++ {
		off *= r.chunkSize
		length *= r.chunkSize
	}
	wg.Add(1)
	go r.join(cctx, b, off, off+length, depth, treeSize/r.branches, r.chunkData, &wg, errC, quitC)
	go func() {
		wg.Wait()
		close(errC)
	}()

	err = <-errC
	if err != nil {
		log.Debug("lazychunkreader.readat.errc", "err", err)
		close(quitC)
		return 0, err
	}
	if off+int64(len(b)) >= size {
		log.Debug("lazychunkreader.readat.return at end", "size", size, "off", off)
		return int(size - off), io.EOF
	}
	log.Debug("lazychunkreader.readat.errc", "buff", len(b))
	return len(b), nil
}
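// A minimal sketch of reading a sub-range with ReadAt, following the
// contract spelled out above: Size must have been called first, and io.EOF
// is returned when the read reaches the end of the document. The helper
// name readRange is hypothetical.
//
//	func readRange(ctx context.Context, r *LazyChunkReader, off, n int64) ([]byte, error) {
//		if _, err := r.Size(ctx, nil); err != nil {
//			return nil, err
//		}
//		buf := make([]byte, n)
//		read, err := r.ReadAt(buf, off)
//		if err == io.EOF {
//			err = nil // a short read at the end of the document is not an error here
//		}
//		return buf[:read], err
//	}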
func (r *LazyChunkReader) join(ctx context.Context, b []byte, off int64, eoff int64, depth int, treeSize int64, chunkData ChunkData, parentWg *sync.WaitGroup, errC chan error, quitC chan bool) {
	defer parentWg.Done()
	// find the appropriate block level
	for chunkData.Size() < uint64(treeSize) && depth > r.depth {
		treeSize /= r.branches
		depth--
	}

	// leaf chunk found
	if depth == r.depth {
		extra := 8 + eoff - int64(len(chunkData))
		if extra > 0 {
			eoff -= extra
		}
		copy(b, chunkData[8+off:8+eoff])
		return // content chunk: the requested data has been copied into b
	}

	// subtree
	start := off / treeSize
	end := (eoff + treeSize - 1) / treeSize

	// the last non-leaf chunk can be shorter than the default chunk size; let's not read it further than its end
	currentBranches := int64(len(chunkData)-8) / r.hashSize
	if end > currentBranches {
		end = currentBranches
	}

	wg := &sync.WaitGroup{}
	defer wg.Wait()
	for i := start; i < end; i++ {
		soff := i * treeSize
		roff := soff
		seoff := soff + treeSize

		if soff < off {
			soff = off
		}
		if seoff > eoff {
			seoff = eoff
		}
		if depth > 1 {
			wg.Wait()
		}
		wg.Add(1)
		go func(j int64) {
			childAddress := chunkData[8+j*r.hashSize : 8+(j+1)*r.hashSize]
			startTime := time.Now()
			chunkData, err := r.getter.Get(ctx, Reference(childAddress))
			if err != nil {
				metrics.GetOrRegisterResettingTimer("lcr.getter.get.err", nil).UpdateSince(startTime)
				log.Debug("lazychunkreader.join", "key", fmt.Sprintf("%x", childAddress), "err", err)
				select {
				case errC <- fmt.Errorf("chunk %v-%v not found; key: %s", off, off+treeSize, fmt.Sprintf("%x", childAddress)):
				case <-quitC:
				}
				return
			}
			metrics.GetOrRegisterResettingTimer("lcr.getter.get", nil).UpdateSince(startTime)
			if l := len(chunkData); l < 9 {
				select {
				case errC <- fmt.Errorf("chunk %v-%v incomplete; key: %s, data length %v", off, off+treeSize, fmt.Sprintf("%x", childAddress), l):
				case <-quitC:
				}
				return
			}
			if soff < off {
				soff = off
			}
			r.join(ctx, b[soff-off:seoff-off], soff-roff, seoff-roff, depth-1, treeSize/r.branches, chunkData, wg, errC, quitC)
		}(i)
	}
}

// Read keeps a cursor, so it cannot be called simultaneously; see ReadAt
func (r *LazyChunkReader) Read(b []byte) (read int, err error) {
	log.Debug("lazychunkreader.read", "key", r.addr)
	metrics.GetOrRegisterCounter("lazychunkreader.read", nil).Inc(1)

	read, err = r.ReadAt(b, r.off)
	if err != nil && err != io.EOF {
		log.Debug("lazychunkreader.readat", "read", read, "err", err)
		metrics.GetOrRegisterCounter("lazychunkreader.read.err", nil).Inc(1)
	}

	metrics.GetOrRegisterCounter("lazychunkreader.read.bytes", nil).Inc(int64(read))

	r.off += int64(read)
	return read, err
}

// completely analogous to the standard SectionReader implementation
var errWhence = errors.New("Seek: invalid whence")
var errOffset = errors.New("Seek: invalid offset")

func (r *LazyChunkReader) Seek(offset int64, whence int) (int64, error) {
	cctx, sp := spancontext.StartSpan(
		r.ctx,
		"lcr.seek")
	defer sp.Finish()

	log.Debug("lazychunkreader.seek", "key", r.addr, "offset", offset)
	switch whence {
	default:
		return 0, errWhence
	case 0:
		offset += 0
	case 1:
		offset += r.off
	case 2:
		if r.chunkData == nil { // seeking from the end requires the root chunk for the size; call Size first
			_, err := r.Size(cctx, nil)
			if err != nil {
				return 0, fmt.Errorf("can't get size: %v", err)
			}
		}
		offset += int64(r.chunkData.Size())
	}

	if offset < 0 {
		return 0, errOffset
	}
	r.off = offset
	return offset, nil
}
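// A minimal Seek/Read sketch: the reader behaves like an io.ReadSeeker, so
// the tail of a document can be read by seeking from the end (whence == 2),
// which transparently fetches the root chunk to learn the size. The helper
// name readTail is hypothetical.
//
//	func readTail(r *LazyChunkReader, n int64) ([]byte, error) {
//		if _, err := r.Seek(-n, 2); err != nil { // 2 == io.SeekEnd
//			return nil, err
//		}
//		buf := make([]byte, n)
//		read, err := r.Read(buf)
//		if err == io.EOF {
//			err = nil // EOF at the very end of the document is expected
//		}
//		return buf[:read], err
//	}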