github.com/ledgerwatch/erigon-lib@v1.0.0/compress/compress.go

/*
   Copyright 2021 Erigon contributors

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package compress

import (
	"bufio"
	"bytes"
	"container/heap"
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"math/bits"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/c2h5oh/datasize"
	"github.com/ledgerwatch/erigon-lib/common"
	dir2 "github.com/ledgerwatch/erigon-lib/common/dir"
	"github.com/ledgerwatch/erigon-lib/etl"
	"github.com/ledgerwatch/log/v3"
	"golang.org/x/exp/slices"
)

// Compressor is the main operating type for performing per-word compression.
// After creating a Compressor, add words to it using the `AddWord` function; to add
// a word without compression, use `AddUncompressedWord` instead.
// The Compressor only tracks which words are compressed and which are not until the
// compressed file is created. After that, the user of the file needs to know when to
// call `Next` or `NextUncompressed` on the decompressor.
// Finally, the `Compress` function needs to be called to perform the compression
// and eventually create the output file.
type Compressor struct {
	ctx              context.Context
	wg               *sync.WaitGroup
	superstrings     chan []byte
	uncompressedFile *DecompressedFile
	tmpDir           string // temporary directory to use for ETL when building dictionary
	logPrefix        string
	outputFile       string // file where to output the dictionary and compressed data
	tmpOutFilePath   string // temporary (.tmp) file that is renamed to outputFile on success
	suffixCollectors []*etl.Collector
	// Buffer for the "superstring" - a transformation of words where each byte of a word,
	// say b, is turned into 2 bytes, 0x01 and b, and two zero bytes 0x00 0x00 are inserted
	// after each word. This allows an ordinary (single-string) suffix sorting algorithm to
	// be used instead of a generalised (multi-string) suffix sorting algorithm.
	superstring      []byte
	wordsCount       uint64
	superstringCount uint64
	superstringLen   int
	workers          int
	Ratio            CompressionRatio
	lvl              log.Lvl
	trace            bool
	logger           log.Logger
	noFsync          bool // fsync is enabled by default, but tests can manually disable
}
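// A minimal illustrative sketch (the function name is hypothetical, not part of the
// original file) of the superstring transformation described in the struct comment
// above: for the word "ab", the fragment appended to the superstring is
// 0x01 'a' 0x01 'b' 0x00 0x00 - the same bytes AddWord produces below.
func exampleSuperstringFragment() {
	word := []byte("ab")
	var fragment []byte
	for _, b := range word {
		fragment = append(fragment, 1, b) // each byte b becomes the pair 0x01, b
	}
	fragment = append(fragment, 0, 0) // every word is terminated by 0x00 0x00
	fmt.Printf("% x\n", fragment)     // prints: 01 61 01 62 00 00
}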
func NewCompressor(ctx context.Context, logPrefix, outputFile, tmpDir string, minPatternScore uint64, workers int, lvl log.Lvl, logger log.Logger) (*Compressor, error) {
	dir2.MustExist(tmpDir)
	dir, fileName := filepath.Split(outputFile)
	tmpOutFilePath := filepath.Join(dir, fileName) + ".tmp"
	// uncompressedFile is the intermediate .idt file; outputFile is the final .seg (or .dat) file.
	// tmpOutFilePath is the ".seg.tmp" (".idt.tmp") file, which is renamed to the .seg file if everything succeeds.
	// This allows the .seg file to be created atomically (the downloader will not see partially written / not-ready .seg files).
	// The ".seg.tmp" file is deliberately not created in tmpDir, because tmpDir and the snapshots dir may be
	// mounted on different drives, and a rename across filesystems is not atomic.
	uncompressedPath := filepath.Join(tmpDir, fileName) + ".idt"

	uncompressedFile, err := NewUncompressedFile(uncompressedPath)
	if err != nil {
		return nil, err
	}

	// Collector for dictionary superstrings (sorted by their score)
	superstrings := make(chan []byte, workers*2)
	wg := &sync.WaitGroup{}
	wg.Add(workers)
	suffixCollectors := make([]*etl.Collector, workers)
	for i := 0; i < workers; i++ {
		collector := etl.NewCollector(logPrefix+"_dict", tmpDir, etl.NewSortableBuffer(etl.BufferOptimalSize/2), logger)
		collector.LogLvl(lvl)

		suffixCollectors[i] = collector
		go processSuperstring(ctx, superstrings, collector, minPatternScore, wg, logger)
	}

	return &Compressor{
		uncompressedFile: uncompressedFile,
		tmpOutFilePath:   tmpOutFilePath,
		outputFile:       outputFile,
		tmpDir:           tmpDir,
		logPrefix:        logPrefix,
		workers:          workers,
		ctx:              ctx,
		superstrings:     superstrings,
		suffixCollectors: suffixCollectors,
		lvl:              lvl,
		wg:               wg,
		logger:           logger,
	}, nil
}

func (c *Compressor) Close() {
	c.uncompressedFile.Close()
	for _, collector := range c.suffixCollectors {
		collector.Close()
	}
	c.suffixCollectors = nil
}

func (c *Compressor) SetTrace(trace bool) { c.trace = trace }

func (c *Compressor) Count() int { return int(c.wordsCount) }

func (c *Compressor) AddWord(word []byte) error {
	select {
	case <-c.ctx.Done():
		return c.ctx.Err()
	default:
	}

	c.wordsCount++
	l := 2*len(word) + 2
	if c.superstringLen+l > superstringLimit {
		if c.superstringCount%samplingFactor == 0 {
			c.superstrings <- c.superstring
		}
		c.superstringCount++
		c.superstring = make([]byte, 0, 1024*1024)
		c.superstringLen = 0
	}
	c.superstringLen += l

	if c.superstringCount%samplingFactor == 0 {
		for _, a := range word {
			c.superstring = append(c.superstring, 1, a)
		}
		c.superstring = append(c.superstring, 0, 0)
	}

	return c.uncompressedFile.Append(word)
}

func (c *Compressor) AddUncompressedWord(word []byte) error {
	select {
	case <-c.ctx.Done():
		return c.ctx.Err()
	default:
	}

	c.wordsCount++
	return c.uncompressedFile.AppendUncompressed(word)
}

func (c *Compressor) Compress() error {
	c.uncompressedFile.w.Flush()
	logEvery := time.NewTicker(20 * time.Second)
	defer logEvery.Stop()
	if len(c.superstring) > 0 {
		c.superstrings <- c.superstring
	}
	close(c.superstrings)
	c.wg.Wait()

	if c.lvl < log.LvlTrace {
		c.logger.Log(c.lvl, fmt.Sprintf("[%s] BuildDict start", c.logPrefix), "workers", c.workers)
	}
	t := time.Now()
	db, err := DictionaryBuilderFromCollectors(c.ctx, compressLogPrefix, c.tmpDir, c.suffixCollectors, c.lvl, c.logger)
	if err != nil {
		return err
	}
	if c.trace {
		_, fileName := filepath.Split(c.outputFile)
		if err := PersistDictrionary(filepath.Join(c.tmpDir, fileName)+".dictionary.txt", db); err != nil {
			return err
		}
	}
	defer os.Remove(c.tmpOutFilePath)
	if c.lvl < log.LvlTrace {
		c.logger.Log(c.lvl, fmt.Sprintf("[%s] BuildDict", c.logPrefix), "took", time.Since(t))
	}

	cf, err := os.Create(c.tmpOutFilePath)
	if err != nil {
		return err
	}
	defer cf.Close()
	t = time.Now()
	if err := reducedict(c.ctx, c.trace, c.logPrefix, c.tmpOutFilePath, cf, c.uncompressedFile, c.workers, db, c.lvl, c.logger); err != nil {
		return err
	}
	if err = c.fsync(cf); err != nil {
		return err
	}
	if err = cf.Close(); err != nil {
		return err
	}
	if err := os.Rename(c.tmpOutFilePath, c.outputFile); err != nil {
		return fmt.Errorf("renaming: %w", err)
	}

	c.Ratio, err = Ratio(c.uncompressedFile.filePath, c.outputFile)
	if err != nil {
		return fmt.Errorf("ratio: %w", err)
	}

	_, fName := filepath.Split(c.outputFile)
	if c.lvl < log.LvlTrace {
		c.logger.Log(c.lvl, fmt.Sprintf("[%s] Compress", c.logPrefix), "took", time.Since(t), "ratio", c.Ratio, "file", fName)
	}
	return nil
}

func (c *Compressor) DisableFsync() { c.noFsync = true }

// fsync - other processes/goroutines must see only "fully-complete" (valid) files, never partial writes.
// To achieve this: write to a .tmp file, then `rename` when the file is ready.
// The machine may power off right after `rename`, which means `fsync` must happen before `rename`.
func (c *Compressor) fsync(f *os.File) error {
	if c.noFsync {
		return nil
	}
	if err := f.Sync(); err != nil {
		c.logger.Warn("couldn't fsync", "err", err, "file", c.tmpOutFilePath)
		return err
	}
	return nil
}
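// A minimal sketch (a hypothetical helper, not part of the original file) of the
// write-to-tmp, fsync, then rename pattern that Compress relies on for atomicity:
// readers either see no file at all, or a fully written one.
func writeFileAtomically(path string, data []byte) error {
	tmp := path + ".tmp"
	f, err := os.Create(tmp)
	if err != nil {
		return err
	}
	defer f.Close()
	if _, err := f.Write(data); err != nil {
		return err
	}
	if err := f.Sync(); err != nil { // fsync must happen before rename
		return err
	}
	if err := f.Close(); err != nil {
		return err
	}
	return os.Rename(tmp, path) // atomic replacement on POSIX filesystems
}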
// superstringLimit limits how large one "superstring" can get before it is processed.
// CompressorSequential allocates 7 bytes for each uint of superstringLimit. For example,
// a superstringLimit of 16m will result in 112MB being allocated for various arrays.
const superstringLimit = 16 * 1024 * 1024

// minPatternLen is the minimum length of a pattern we consider for inclusion in the dictionary
const minPatternLen = 5
const maxPatternLen = 128

// maxDictPatterns is the maximum number of patterns allowed in the initial (not yet reduced) dictionary.
// Large values increase memory consumption of the dictionary reduction phase.
/*
	Experiments on a 74GB uncompressed file (bsc 012500-013000-transactions.seg)
	Ram        - needed just to open the compressed file (Huff tables, etc...)
	dec_speed  - loop with `word, _ = g.Next(word[:0])`
	skip_speed - loop with `g.Skip()`

	| DictSize | Ram  | file_size | dec_speed | skip_speed |
	| -------- | ---- | --------- | --------- | ---------- |
	| 1M       | 70Mb | 35871Mb   | 4m06s     | 1m58s      |
	| 512K     | 42Mb | 36496Mb   | 3m49s     | 1m51s      |
	| 256K     | 21Mb | 37100Mb   | 3m44s     | 1m48s      |
	| 128K     | 11Mb | 37782Mb   | 3m25s     | 1m44s      |
	| 64K      | 7Mb  | 38597Mb   | 3m16s     | 1m34s      |
	| 32K      | 5Mb  | 39626Mb   | 3m0s      | 1m29s      |
*/
const maxDictPatterns = 64 * 1024

// samplingFactor - skip superstrings if `superstringNumber % samplingFactor != 0`
const samplingFactor = 4

// nolint
const compressLogPrefix = "compress"
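// A quick back-of-the-envelope check (illustrative only, not part of the original
// file) of the memory estimate in the superstringLimit comment above: at 7 bytes
// per superstring byte, the 16m limit works out to 112MiB.
func exampleSuperstringMemoryEstimate() {
	const bytesPerUint = 7
	total := superstringLimit * bytesPerUint // 117_440_512 bytes
	fmt.Printf("%d MiB\n", total/(1024*1024)) // prints: 112 MiB
}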
type DictionaryBuilder struct {
	lastWord      []byte
	items         []*Pattern
	limit         int
	lastWordScore uint64
}

func (db *DictionaryBuilder) Reset(limit int) {
	db.limit = limit
	db.items = db.items[:0]
}

func (db *DictionaryBuilder) Len() int { return len(db.items) }
func (db *DictionaryBuilder) Less(i, j int) bool {
	if db.items[i].score == db.items[j].score {
		return bytes.Compare(db.items[i].word, db.items[j].word) < 0
	}
	return db.items[i].score < db.items[j].score
}

func dictionaryBuilderLess(i, j *Pattern) bool {
	if i.score == j.score {
		return bytes.Compare(i.word, j.word) < 0
	}
	return i.score < j.score
}

func (db *DictionaryBuilder) Swap(i, j int) {
	db.items[i], db.items[j] = db.items[j], db.items[i]
}
func (db *DictionaryBuilder) Sort() { slices.SortFunc(db.items, dictionaryBuilderLess) }

func (db *DictionaryBuilder) Push(x interface{}) {
	db.items = append(db.items, x.(*Pattern))
}

func (db *DictionaryBuilder) Pop() interface{} {
	old := db.items
	n := len(old)
	x := old[n-1]
	old[n-1] = nil
	db.items = old[0 : n-1]
	return x
}

func (db *DictionaryBuilder) processWord(chars []byte, score uint64) {
	heap.Push(db, &Pattern{word: common.Copy(chars), score: score})
	if db.Len() > db.limit {
		// Remove the element with the smallest score
		heap.Pop(db)
	}
}

func (db *DictionaryBuilder) loadFunc(k, v []byte, table etl.CurrentTableReader, next etl.LoadNextFunc) error {
	score := binary.BigEndian.Uint64(v)
	if bytes.Equal(k, db.lastWord) {
		db.lastWordScore += score
	} else {
		if db.lastWord != nil {
			db.processWord(db.lastWord, db.lastWordScore)
		}
		db.lastWord = append(db.lastWord[:0], k...)
		db.lastWordScore = score
	}
	return nil
}

func (db *DictionaryBuilder) finish() {
	if db.lastWord != nil {
		db.processWord(db.lastWord, db.lastWordScore)
	}
}

func (db *DictionaryBuilder) ForEach(f func(score uint64, word []byte)) {
	for i := db.Len(); i > 0; i-- {
		f(db.items[i-1].score, db.items[i-1].word)
	}
}

func (db *DictionaryBuilder) Close() {
	db.items = nil
	db.lastWord = nil
}
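// A minimal sketch (hypothetical, not part of the original file) of how
// DictionaryBuilder keeps only the `limit` highest-scoring patterns: it is a
// min-heap ordered by score, so whenever the heap exceeds the limit, the pattern
// with the smallest score is evicted.
func exampleTopNSelection() {
	db := &DictionaryBuilder{}
	db.Reset(2) // keep at most 2 patterns
	db.processWord([]byte("low"), 1)
	db.processWord([]byte("mid"), 5)
	db.processWord([]byte("high"), 9) // heap now exceeds the limit; score 1 is evicted
	db.Sort()                         // ascending by score, so ForEach visits highest score first
	db.ForEach(func(score uint64, word []byte) {
		fmt.Println(score, string(word)) // prints: 9 high, then 5 mid
	})
}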
// Pattern is a representation of a pattern that is searched for in the superstrings
// to compress them. Patterns are stored in a patricia tree and carry the pattern score
// (calculated during the initial dictionary building), frequency of usage, and code.
type Pattern struct {
	word     []byte // Pattern characters
	score    uint64 // Score assigned to the pattern during dictionary building
	uses     uint64 // How many times this pattern has been used during search and optimisation
	code     uint64 // Allocated numerical code
	codeBits int    // Number of bits in the code
	depth    int    // Depth of the pattern in the huffman tree (for encoding in the file)
}

// PatternList is a sorted list of patterns for the purpose of
// building a Huffman tree to determine efficient coding.
// Patterns with the least usage come first; we use the numerical code
// as a tie breaker to make sure the resulting Huffman code is canonical.
type PatternList []*Pattern

func (pl PatternList) Len() int { return len(pl) }
func patternListLess(i, j *Pattern) bool {
	if i.uses == j.uses {
		return bits.Reverse64(i.code) < bits.Reverse64(j.code)
	}
	return i.uses < j.uses
}

// PatternHuff is an intermediate node in a huffman tree of patterns.
// It has two children, each of which may either be another intermediate node (h0 or h1)
// or a leaf node, which is a Pattern (p0 or p1).
type PatternHuff struct {
	p0         *Pattern
	p1         *Pattern
	h0         *PatternHuff
	h1         *PatternHuff
	uses       uint64
	tieBreaker uint64
}

func (h *PatternHuff) AddZero() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.codeBits++
	} else {
		h.h0.AddZero()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.codeBits++
	} else {
		h.h1.AddZero()
	}
}

func (h *PatternHuff) AddOne() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.code++
		h.p0.codeBits++
	} else {
		h.h0.AddOne()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.code++
		h.p1.codeBits++
	} else {
		h.h1.AddOne()
	}
}

func (h *PatternHuff) SetDepth(depth int) {
	if h.p0 != nil {
		h.p0.depth = depth + 1
		h.p0.uses = 0
	}
	if h.p1 != nil {
		h.p1.depth = depth + 1
		h.p1.uses = 0
	}
	if h.h0 != nil {
		h.h0.SetDepth(depth + 1)
	}
	if h.h1 != nil {
		h.h1.SetDepth(depth + 1)
	}
}
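// An illustrative sketch (not part of the original file; the actual tree
// construction lives elsewhere in this package) of the AddZero/AddOne mechanics:
// when a subtree is merged under a new parent, a 0 or 1 bit is appended to every
// code in that subtree as the new least significant bit, while fresh leaves get
// their first code bit assigned directly. Since codes are emitted LSB-first, the
// bit nearest the root is consumed first when decoding.
func exampleHuffCodeAssignment() {
	a := &Pattern{word: []byte("a"), uses: 1}
	b := &Pattern{word: []byte("b"), uses: 2}
	c := &Pattern{word: []byte("c"), uses: 4}
	// Merge the two least-used leaves; each gets its first code bit directly.
	h1 := &PatternHuff{p0: a, p1: b, uses: a.uses + b.uses}
	a.code, a.codeBits = 0, 1
	b.code, b.codeBits = 1, 1
	// Merge the subtree with the remaining leaf; AddZero appends a 0 bit to
	// every code in the subtree.
	h2 := &PatternHuff{h0: h1, p1: c, uses: h1.uses + c.uses}
	h2.h0.AddZero()
	c.code, c.codeBits = 1, 1
	fmt.Printf("a=%0*b b=%0*b c=%0*b\n", a.codeBits, a.code, b.codeBits, b.code, c.codeBits, c.code)
	// prints: a=00 b=10 c=1
}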
// PatternHeap is a priority queue of patterns for the purpose of building a
// Huffman tree to determine efficient coding. Patterns with the least usage
// have the highest priority. We use a tie-breaker to make sure
// the resulting Huffman code is canonical.
type PatternHeap []*PatternHuff

func (ph PatternHeap) Len() int {
	return len(ph)
}

func (ph PatternHeap) Less(i, j int) bool {
	if ph[i].uses == ph[j].uses {
		return ph[i].tieBreaker < ph[j].tieBreaker
	}
	return ph[i].uses < ph[j].uses
}

func (ph *PatternHeap) Swap(i, j int) {
	(*ph)[i], (*ph)[j] = (*ph)[j], (*ph)[i]
}

func (ph *PatternHeap) Push(x interface{}) {
	*ph = append(*ph, x.(*PatternHuff))
}

func (ph *PatternHeap) Pop() interface{} {
	old := *ph
	n := len(old)
	x := old[n-1]
	old[n-1] = nil
	*ph = old[0 : n-1]
	return x
}

type Position struct {
	uses     uint64
	pos      uint64
	code     uint64
	codeBits int
	depth    int // Depth of the position in the huffman tree (for encoding in the file)
}

type PositionHuff struct {
	p0         *Position
	p1         *Position
	h0         *PositionHuff
	h1         *PositionHuff
	uses       uint64
	tieBreaker uint64
}

func (h *PositionHuff) AddZero() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.codeBits++
	} else {
		h.h0.AddZero()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.codeBits++
	} else {
		h.h1.AddZero()
	}
}

func (h *PositionHuff) AddOne() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.code++
		h.p0.codeBits++
	} else {
		h.h0.AddOne()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.code++
		h.p1.codeBits++
	} else {
		h.h1.AddOne()
	}
}

func (h *PositionHuff) SetDepth(depth int) {
	if h.p0 != nil {
		h.p0.depth = depth + 1
		h.p0.uses = 0
	}
	if h.p1 != nil {
		h.p1.depth = depth + 1
		h.p1.uses = 0
	}
	if h.h0 != nil {
		h.h0.SetDepth(depth + 1)
	}
	if h.h1 != nil {
		h.h1.SetDepth(depth + 1)
	}
}

type PositionList []*Position

func (pl PositionList) Len() int { return len(pl) }

func positionListLess(i, j *Position) bool {
	if i.uses == j.uses {
		return bits.Reverse64(i.code) < bits.Reverse64(j.code)
	}
	return i.uses < j.uses
}

type PositionHeap []*PositionHuff

func (ph PositionHeap) Len() int {
	return len(ph)
}

func (ph PositionHeap) Less(i, j int) bool {
	if ph[i].uses == ph[j].uses {
		return ph[i].tieBreaker < ph[j].tieBreaker
	}
	return ph[i].uses < ph[j].uses
}

func (ph *PositionHeap) Swap(i, j int) {
	(*ph)[i], (*ph)[j] = (*ph)[j], (*ph)[i]
}

func (ph *PositionHeap) Push(x interface{}) {
	*ph = append(*ph, x.(*PositionHuff))
}

func (ph *PositionHeap) Pop() interface{} {
	old := *ph
	n := len(old)
	x := old[n-1]
	old[n-1] = nil
	*ph = old[0 : n-1]
	return x
}

type HuffmanCoder struct {
	w          *bufio.Writer
	outputBits int
	outputByte byte
}

func (hf *HuffmanCoder) encode(code uint64, codeBits int) error {
	for codeBits > 0 {
		var bitsUsed int
		if hf.outputBits+codeBits > 8 {
			bitsUsed = 8 - hf.outputBits
		} else {
			bitsUsed = codeBits
		}
		mask := (uint64(1) << bitsUsed) - 1
		hf.outputByte |= byte((code & mask) << hf.outputBits)
		code >>= bitsUsed
		codeBits -= bitsUsed
		hf.outputBits += bitsUsed
		if hf.outputBits == 8 {
			if e := hf.w.WriteByte(hf.outputByte); e != nil {
				return e
			}
			hf.outputBits = 0
			hf.outputByte = 0
		}
	}
	return nil
}

func (hf *HuffmanCoder) flush() error {
	if hf.outputBits > 0 {
		if e := hf.w.WriteByte(hf.outputByte); e != nil {
			return e
		}
		hf.outputBits = 0
		hf.outputByte = 0
	}
	return nil
}
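// A small worked example (illustrative only, not part of the original file) of
// HuffmanCoder's bit packing: codes are written least-significant-bit first and
// accumulated into whole bytes before being flushed to the writer.
func exampleHuffmanCoder() {
	var out bytes.Buffer
	hf := &HuffmanCoder{w: bufio.NewWriter(&out)}
	_ = hf.encode(0b1, 1)   // emits bit 1
	_ = hf.encode(0b01, 2)  // emits bits 1, 0 (LSB first)
	_ = hf.encode(0b101, 3) // emits bits 1, 0, 1
	_ = hf.flush()          // pads the final partial byte with zero bits
	hf.w.Flush()
	fmt.Printf("%08b\n", out.Bytes()) // prints: [00101011]
}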
// DynamicCell represents the result of dynamic programming for a certain starting position
type DynamicCell struct {
	optimStart  int
	coverStart  int
	compression int
	score       uint64
	patternIdx  int // offset of the last element in the pattern slice
}

// Ring is a ring buffer (deque) of DynamicCells. The backing slice always has a
// power-of-two length (it starts at 16 and ensureSize doubles it), which is what
// makes the bitmask indexing in Get and Truncate valid.
type Ring struct {
	cells             []DynamicCell
	head, tail, count int
}

func NewRing() *Ring {
	return &Ring{
		cells: make([]DynamicCell, 16),
		head:  0,
		tail:  0,
		count: 0,
	}
}

func (r *Ring) Reset() {
	r.count = 0
	r.head = 0
	r.tail = 0
}

func (r *Ring) ensureSize() {
	if r.count < len(r.cells) {
		return
	}
	newcells := make([]DynamicCell, r.count*2)
	if r.tail > r.head {
		copy(newcells, r.cells[r.head:r.tail])
	} else {
		n := copy(newcells, r.cells[r.head:])
		copy(newcells[n:], r.cells[:r.tail])
	}
	r.head = 0
	r.tail = r.count
	r.cells = newcells
}

func (r *Ring) PushFront() *DynamicCell {
	r.ensureSize()
	if r.head == 0 {
		r.head = len(r.cells)
	}
	r.head--
	r.count++
	return &r.cells[r.head]
}

func (r *Ring) PushBack() *DynamicCell {
	r.ensureSize()
	if r.tail == len(r.cells) {
		r.tail = 0
	}
	result := &r.cells[r.tail]
	r.tail++
	r.count++
	return result
}

func (r Ring) Len() int {
	return r.count
}

func (r *Ring) Get(i int) *DynamicCell {
	if i < 0 || i >= r.count {
		return nil
	}
	return &r.cells[(r.head+i)&(len(r.cells)-1)]
}

// Truncate removes all items starting from i
func (r *Ring) Truncate(i int) {
	r.count = i
	r.tail = (r.head + i) & (len(r.cells) - 1)
}
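// A brief usage sketch (illustrative only, not part of the original file) of Ring
// as a deque: PushFront wraps the head around the power-of-two backing slice, and
// Get addresses cells relative to the head with a bitmask.
func exampleRing() {
	r := NewRing()
	r.PushBack().optimStart = 1
	r.PushBack().optimStart = 2
	r.PushFront().optimStart = 0 // wraps: stored at the end of the backing slice
	fmt.Println(r.Len(), r.Get(0).optimStart, r.Get(2).optimStart) // prints: 3 0 2
	r.Truncate(1) // drop everything from logical index 1 onward
	fmt.Println(r.Len()) // prints: 1
}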
type DictAggregator struct {
	collector     *etl.Collector
	dist          map[int]int
	lastWord      []byte
	lastWordScore uint64
}

func (da *DictAggregator) processWord(word []byte, score uint64) error {
	var scoreBuf [8]byte
	binary.BigEndian.PutUint64(scoreBuf[:], score)
	return da.collector.Collect(word, scoreBuf[:])
}

func (da *DictAggregator) Load(loadFunc etl.LoadFunc, args etl.TransformArgs) error {
	defer da.collector.Close()
	return da.collector.Load(nil, "", loadFunc, args)
}

func (da *DictAggregator) aggLoadFunc(k, v []byte, table etl.CurrentTableReader, next etl.LoadNextFunc) error {
	if _, ok := da.dist[len(k)]; !ok {
		da.dist[len(k)] = 0
	}
	da.dist[len(k)]++

	score := binary.BigEndian.Uint64(v)
	if bytes.Equal(k, da.lastWord) {
		da.lastWordScore += score
	} else {
		if da.lastWord != nil {
			if err := da.processWord(da.lastWord, da.lastWordScore); err != nil {
				return err
			}
		}
		da.lastWord = append(da.lastWord[:0], k...)
		da.lastWordScore = score
	}
	return nil
}

func (da *DictAggregator) finish() error {
	if da.lastWord != nil {
		return da.processWord(da.lastWord, da.lastWordScore)
	}
	return nil
}

type CompressionRatio float64

func (r CompressionRatio) String() string { return fmt.Sprintf("%.2f", r) }

func Ratio(f1, f2 string) (CompressionRatio, error) {
	s1, err := os.Stat(f1)
	if err != nil {
		return 0, err
	}
	s2, err := os.Stat(f2)
	if err != nil {
		return 0, err
	}
	return CompressionRatio(float64(s1.Size()) / float64(s2.Size())), nil
}

// DecompressedFile - a simple .dat/.idt file format for a temporary data store
type DecompressedFile struct {
	f        *os.File
	w        *bufio.Writer
	filePath string
	buf      []byte
	count    uint64
}

func NewUncompressedFile(filePath string) (*DecompressedFile, error) {
	f, err := os.Create(filePath)
	if err != nil {
		return nil, err
	}
	w := bufio.NewWriterSize(f, 2*etl.BufIOSize)
	return &DecompressedFile{filePath: filePath, f: f, w: w, buf: make([]byte, 128)}, nil
}
func (f *DecompressedFile) Close() {
	f.w.Flush()
	f.f.Close()
	os.Remove(f.filePath)
}
func (f *DecompressedFile) Append(v []byte) error {
	f.count++
	// For compressed words, the length prefix is shifted to make the lowest bit zero
	n := binary.PutUvarint(f.buf, 2*uint64(len(v)))
	if _, e := f.w.Write(f.buf[:n]); e != nil {
		return e
	}
	if len(v) > 0 {
		if _, e := f.w.Write(v); e != nil {
			return e
		}
	}
	return nil
}
func (f *DecompressedFile) AppendUncompressed(v []byte) error {
	f.count++
	// For uncompressed words, the length prefix is shifted to make the lowest bit one
	n := binary.PutUvarint(f.buf, 2*uint64(len(v))+1)
	if _, e := f.w.Write(f.buf[:n]); e != nil {
		return e
	}
	if len(v) > 0 {
		if _, e := f.w.Write(v); e != nil {
			return e
		}
	}
	return nil
}

// ForEach reads the words back from the file and passes each one to the walker,
// along with its "compressed" flag. Callers typically use these words to generate
// superstrings (with an extra 0x1 byte prepended to each character, and a 0x0 0x0
// pair inserted between words). We only consider values with length > 2, because
// smaller values are not compressible without going into bits.
func (f *DecompressedFile) ForEach(walker func(v []byte, compressed bool) error) error {
	_, err := f.f.Seek(0, 0)
	if err != nil {
		return err
	}
	r := bufio.NewReaderSize(f.f, int(8*datasize.MB))
	buf := make([]byte, 16*1024)
	l, e := binary.ReadUvarint(r)
	for ; e == nil; l, e = binary.ReadUvarint(r) {
		// extract the lowest bit of the length prefix as the "uncompressed" flag,
		// and shift to obtain the actual length
		compressed := (l & 1) == 0
		l >>= 1
		if len(buf) < int(l) {
			buf = make([]byte, l)
		}
		if _, e = io.ReadFull(r, buf[:l]); e != nil {
			return e
		}
		if err := walker(buf[:l], compressed); err != nil {
			return err
		}
	}
	if e != nil && !errors.Is(e, io.EOF) {
		return e
	}
	return nil
}
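// A worked example (illustrative only, not part of the original file) of the
// length-prefix encoding used by Append and AppendUncompressed: the word length
// is doubled so that the lowest bit of the uvarint prefix is free to mark whether
// the word skips compression.
func exampleLengthPrefix() {
	var buf [binary.MaxVarintLen64]byte
	word := []byte("hello") // len 5
	n := binary.PutUvarint(buf[:], 2*uint64(len(word))) // compressed: prefix 10, low bit 0
	fmt.Println(buf[:n]) // prints: [10]
	n = binary.PutUvarint(buf[:], 2*uint64(len(word))+1) // uncompressed: prefix 11, low bit 1
	fmt.Println(buf[:n]) // prints: [11]
	// On read, ForEach inverts this: compressed := (l & 1) == 0; l >>= 1
}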