github.com/ledgerwatch/erigon-lib@v1.0.0/compress/parallel_compress.go (about) 1 /* 2 Copyright 2021 Erigon contributors 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package compress 18 19 import ( 20 "bufio" 21 "container/heap" 22 "context" 23 "encoding/binary" 24 "errors" 25 "fmt" 26 "io" 27 "os" 28 "sync" 29 "sync/atomic" 30 "time" 31 32 "github.com/ledgerwatch/erigon-lib/common" 33 "github.com/ledgerwatch/erigon-lib/common/assert" 34 "github.com/ledgerwatch/erigon-lib/etl" 35 "github.com/ledgerwatch/erigon-lib/patricia" 36 "github.com/ledgerwatch/erigon-lib/sais" 37 "github.com/ledgerwatch/log/v3" 38 "golang.org/x/exp/slices" 39 ) 40 41 // MinPatternScore is minimum score (per superstring) required to consider including pattern into the dictionary 42 const MinPatternScore = 1024 43 44 func optimiseCluster(trace bool, input []byte, mf2 *patricia.MatchFinder2, output []byte, uncovered []int, patterns []int, cellRing *Ring, posMap map[uint64]uint64) ([]byte, []int, []int) { 45 matches := mf2.FindLongestMatches(input) 46 47 if len(matches) == 0 { 48 output = append(output, 0) // Encoding of 0 in VarUint is 1 zero byte 49 output = append(output, input...) 50 return output, patterns, uncovered 51 } 52 if trace { 53 fmt.Printf("Cluster | input = %x\n", input) 54 for _, match := range matches { 55 fmt.Printf(" [%x %d-%d]", input[match.Start:match.End], match.Start, match.End) 56 } 57 } 58 cellRing.Reset() 59 patterns = append(patterns[:0], 0, 0) // Sentinel entry - no meaning 60 lastF := matches[len(matches)-1] 61 for j := lastF.Start; j < lastF.End; j++ { 62 d := cellRing.PushBack() 63 d.optimStart = j + 1 64 d.coverStart = len(input) 65 d.compression = 0 66 d.patternIdx = 0 67 d.score = 0 68 } 69 // Starting from the last match 70 for i := len(matches); i > 0; i-- { 71 f := matches[i-1] 72 p := f.Val.(*Pattern) 73 firstCell := cellRing.Get(0) 74 maxCompression := firstCell.compression 75 maxScore := firstCell.score 76 maxCell := firstCell 77 var maxInclude bool 78 for e := 0; e < cellRing.Len(); e++ { 79 cell := cellRing.Get(e) 80 comp := cell.compression - 4 81 if cell.coverStart >= f.End { 82 comp += f.End - f.Start 83 } else { 84 comp += cell.coverStart - f.Start 85 } 86 score := cell.score + p.score 87 if comp > maxCompression || (comp == maxCompression && score > maxScore) { 88 maxCompression = comp 89 maxScore = score 90 maxInclude = true 91 maxCell = cell 92 } else if cell.optimStart > f.End { 93 cellRing.Truncate(e) 94 break 95 } 96 } 97 d := cellRing.PushFront() 98 d.optimStart = f.Start 99 d.score = maxScore 100 d.compression = maxCompression 101 if maxInclude { 102 if trace { 103 fmt.Printf("[include] cell for %d: with patterns", f.Start) 104 fmt.Printf(" [%x %d-%d]", input[f.Start:f.End], f.Start, f.End) 105 patternIdx := maxCell.patternIdx 106 for patternIdx != 0 { 107 pattern := patterns[patternIdx] 108 fmt.Printf(" [%x %d-%d]", input[matches[pattern].Start:matches[pattern].End], matches[pattern].Start, matches[pattern].End) 109 patternIdx = patterns[patternIdx+1] 110 } 111 fmt.Printf("\n\n") 112 } 113 d.coverStart = f.Start 114 d.patternIdx = len(patterns) 115 patterns = append(patterns, i-1, maxCell.patternIdx) 116 } else { 117 if trace { 118 fmt.Printf("cell for %d: with patterns", f.Start) 119 patternIdx := maxCell.patternIdx 120 for patternIdx != 0 { 121 pattern := patterns[patternIdx] 122 fmt.Printf(" [%x %d-%d]", input[matches[pattern].Start:matches[pattern].End], matches[pattern].Start, matches[pattern].End) 123 patternIdx = patterns[patternIdx+1] 124 } 125 fmt.Printf("\n\n") 126 } 127 d.coverStart = maxCell.coverStart 128 d.patternIdx = maxCell.patternIdx 129 } 130 } 131 optimCell := cellRing.Get(0) 132 if trace { 133 fmt.Printf("optimal =") 134 } 135 // Count number of patterns 136 var patternCount uint64 137 patternIdx := optimCell.patternIdx 138 for patternIdx != 0 { 139 patternCount++ 140 patternIdx = patterns[patternIdx+1] 141 } 142 var numBuf [binary.MaxVarintLen64]byte 143 p := binary.PutUvarint(numBuf[:], patternCount) 144 output = append(output, numBuf[:p]...) 145 patternIdx = optimCell.patternIdx 146 lastStart := 0 147 var lastUncovered int 148 uncovered = uncovered[:0] 149 for patternIdx != 0 { 150 pattern := patterns[patternIdx] 151 p := matches[pattern].Val.(*Pattern) 152 if trace { 153 fmt.Printf(" [%x %d-%d]", input[matches[pattern].Start:matches[pattern].End], matches[pattern].Start, matches[pattern].End) 154 } 155 if matches[pattern].Start > lastUncovered { 156 uncovered = append(uncovered, lastUncovered, matches[pattern].Start) 157 } 158 lastUncovered = matches[pattern].End 159 // Starting position 160 posMap[uint64(matches[pattern].Start-lastStart+1)]++ 161 lastStart = matches[pattern].Start 162 n := binary.PutUvarint(numBuf[:], uint64(matches[pattern].Start)) 163 output = append(output, numBuf[:n]...) 164 // Code 165 n = binary.PutUvarint(numBuf[:], p.code) 166 output = append(output, numBuf[:n]...) 167 atomic.AddUint64(&p.uses, 1) 168 patternIdx = patterns[patternIdx+1] 169 } 170 if len(input) > lastUncovered { 171 uncovered = append(uncovered, lastUncovered, len(input)) 172 } 173 if trace { 174 fmt.Printf("\n\n") 175 } 176 // Add uncoded input 177 for i := 0; i < len(uncovered); i += 2 { 178 output = append(output, input[uncovered[i]:uncovered[i+1]]...) 179 } 180 return output, patterns, uncovered 181 } 182 183 func reduceDictWorker(trace bool, inputCh chan *CompressionWord, outCh chan *CompressionWord, completion *sync.WaitGroup, trie *patricia.PatriciaTree, inputSize, outputSize *atomic.Uint64, posMap map[uint64]uint64) { 184 defer completion.Done() 185 var output = make([]byte, 0, 256) 186 var uncovered = make([]int, 256) 187 var patterns = make([]int, 0, 256) 188 cellRing := NewRing() 189 mf2 := patricia.NewMatchFinder2(trie) 190 var numBuf [binary.MaxVarintLen64]byte 191 for compW := range inputCh { 192 wordLen := uint64(len(compW.word)) 193 n := binary.PutUvarint(numBuf[:], wordLen) 194 output = append(output[:0], numBuf[:n]...) // Prepend with the encoding of length 195 output, patterns, uncovered = optimiseCluster(trace, compW.word, mf2, output, uncovered, patterns, cellRing, posMap) 196 compW.word = append(compW.word[:0], output...) 197 outCh <- compW 198 inputSize.Add(1 + wordLen) 199 outputSize.Add(uint64(len(output))) 200 posMap[wordLen+1]++ 201 posMap[0]++ 202 } 203 } 204 205 // CompressionWord hold a word to be compressed (if flag is set), and the result of compression 206 // To allow multiple words to be processed concurrently, order field is used to collect all 207 // the words after processing without disrupting their order 208 type CompressionWord struct { 209 word []byte 210 order uint64 211 } 212 213 type CompressionQueue []*CompressionWord 214 215 func (cq CompressionQueue) Len() int { 216 return len(cq) 217 } 218 219 func (cq CompressionQueue) Less(i, j int) bool { 220 return cq[i].order < cq[j].order 221 } 222 223 func (cq *CompressionQueue) Swap(i, j int) { 224 (*cq)[i], (*cq)[j] = (*cq)[j], (*cq)[i] 225 } 226 227 func (cq *CompressionQueue) Push(x interface{}) { 228 *cq = append(*cq, x.(*CompressionWord)) 229 } 230 231 func (cq *CompressionQueue) Pop() interface{} { 232 old := *cq 233 n := len(old) 234 x := old[n-1] 235 old[n-1] = nil 236 *cq = old[0 : n-1] 237 return x 238 } 239 240 // reduceDict reduces the dictionary by trying the substitutions and counting frequency for each word 241 func reducedict(ctx context.Context, trace bool, logPrefix, segmentFilePath string, cf *os.File, datFile *DecompressedFile, workers int, dictBuilder *DictionaryBuilder, lvl log.Lvl, logger log.Logger) error { 242 logEvery := time.NewTicker(60 * time.Second) 243 defer logEvery.Stop() 244 245 // DictionaryBuilder is for sorting words by their freuency (to assign codes) 246 var pt patricia.PatriciaTree 247 code2pattern := make([]*Pattern, 0, 256) 248 dictBuilder.ForEach(func(score uint64, word []byte) { 249 p := &Pattern{ 250 score: score, 251 uses: 0, 252 code: uint64(len(code2pattern)), 253 codeBits: 0, 254 word: word, 255 } 256 pt.Insert(word, p) 257 code2pattern = append(code2pattern, p) 258 }) 259 dictBuilder.Close() 260 if lvl < log.LvlTrace { 261 logger.Log(lvl, fmt.Sprintf("[%s] dictionary file parsed", logPrefix), "entries", len(code2pattern)) 262 } 263 ch := make(chan *CompressionWord, 10_000) 264 inputSize, outputSize := &atomic.Uint64{}, &atomic.Uint64{} 265 266 var collectors []*etl.Collector 267 defer func() { 268 for _, c := range collectors { 269 c.Close() 270 } 271 }() 272 out := make(chan *CompressionWord, 1024) 273 var compressionQueue CompressionQueue 274 heap.Init(&compressionQueue) 275 queueLimit := 128 * 1024 276 277 // For the case of workers == 1 278 var output = make([]byte, 0, 256) 279 var uncovered = make([]int, 256) 280 var patterns = make([]int, 0, 256) 281 cellRing := NewRing() 282 mf2 := patricia.NewMatchFinder2(&pt) 283 284 var posMaps []map[uint64]uint64 285 uncompPosMap := make(map[uint64]uint64) // For the uncompressed words 286 posMaps = append(posMaps, uncompPosMap) 287 var wg sync.WaitGroup 288 if workers > 1 { 289 for i := 0; i < workers; i++ { 290 posMap := make(map[uint64]uint64) 291 posMaps = append(posMaps, posMap) 292 wg.Add(1) 293 go reduceDictWorker(trace, ch, out, &wg, &pt, inputSize, outputSize, posMap) 294 } 295 } 296 t := time.Now() 297 298 var err error 299 intermediatePath := segmentFilePath + ".tmp" 300 defer os.Remove(intermediatePath) 301 var intermediateFile *os.File 302 if intermediateFile, err = os.Create(intermediatePath); err != nil { 303 return fmt.Errorf("create intermediate file: %w", err) 304 } 305 defer intermediateFile.Close() 306 intermediateW := bufio.NewWriterSize(intermediateFile, 8*etl.BufIOSize) 307 308 var inCount, outCount, emptyWordsCount uint64 // Counters words sent to compression and returned for compression 309 var numBuf [binary.MaxVarintLen64]byte 310 totalWords := datFile.count 311 312 if err = datFile.ForEach(func(v []byte, compression bool) error { 313 select { 314 case <-ctx.Done(): 315 return ctx.Err() 316 default: 317 } 318 if workers > 1 { 319 // take processed words in non-blocking way and push them to the queue 320 outer: 321 for { 322 select { 323 case compW := <-out: 324 heap.Push(&compressionQueue, compW) 325 default: 326 break outer 327 } 328 } 329 // take processed words in blocking way until either: 330 // 1. compressionQueue is below the limit so that new words can be allocated 331 // 2. there is word in order on top of the queue which can be written down and reused 332 for compressionQueue.Len() >= queueLimit && compressionQueue[0].order < outCount { 333 // Blocking wait to receive some outputs until the top of queue can be processed 334 compW := <-out 335 heap.Push(&compressionQueue, compW) 336 } 337 var compW *CompressionWord 338 // Either take the word from the top, write it down and reuse for the next unprocessed word 339 // Or allocate new word 340 if compressionQueue.Len() > 0 && compressionQueue[0].order == outCount { 341 compW = heap.Pop(&compressionQueue).(*CompressionWord) 342 outCount++ 343 // Write to intermediate file 344 if _, e := intermediateW.Write(compW.word); e != nil { 345 return e 346 } 347 // Reuse compW for the next word 348 } else { 349 compW = &CompressionWord{} 350 } 351 compW.order = inCount 352 if len(v) == 0 { 353 // Empty word, cannot be compressed 354 compW.word = append(compW.word[:0], 0) 355 uncompPosMap[1]++ 356 uncompPosMap[0]++ 357 heap.Push(&compressionQueue, compW) // Push to the queue directly, bypassing compression 358 } else if compression { 359 compW.word = append(compW.word[:0], v...) 360 ch <- compW // Send for compression 361 } else { 362 // Prepend word with encoding of length + zero byte, which indicates no patterns to be found in this word 363 wordLen := uint64(len(v)) 364 n := binary.PutUvarint(numBuf[:], wordLen) 365 uncompPosMap[wordLen+1]++ 366 uncompPosMap[0]++ 367 compW.word = append(append(append(compW.word[:0], numBuf[:n]...), 0), v...) 368 heap.Push(&compressionQueue, compW) // Push to the queue directly, bypassing compression 369 } 370 } else { 371 outCount++ 372 wordLen := uint64(len(v)) 373 n := binary.PutUvarint(numBuf[:], wordLen) 374 if _, e := intermediateW.Write(numBuf[:n]); e != nil { 375 return e 376 } 377 if wordLen > 0 { 378 if compression { 379 output, patterns, uncovered = optimiseCluster(trace, v, mf2, output[:0], uncovered, patterns, cellRing, uncompPosMap) 380 if _, e := intermediateW.Write(output); e != nil { 381 return e 382 } 383 outputSize.Add(uint64(len(output))) 384 } else { 385 if e := intermediateW.WriteByte(0); e != nil { 386 return e 387 } 388 if _, e := intermediateW.Write(v); e != nil { 389 return e 390 } 391 outputSize.Add(1 + uint64(len(v))) 392 } 393 } 394 inputSize.Add(1 + wordLen) 395 uncompPosMap[wordLen+1]++ 396 uncompPosMap[0]++ 397 } 398 inCount++ 399 if len(v) == 0 { 400 emptyWordsCount++ 401 } 402 403 select { 404 case <-logEvery.C: 405 if lvl < log.LvlTrace { 406 logger.Log(lvl, fmt.Sprintf("[%s] Replacement preprocessing", logPrefix), "processed", fmt.Sprintf("%.2f%%", 100*float64(outCount)/float64(totalWords)), "ch", len(ch), "workers", workers) 407 } 408 default: 409 } 410 return nil 411 }); err != nil { 412 return err 413 } 414 close(ch) 415 // Drain the out queue if necessary 416 if inCount > outCount { 417 for compressionQueue.Len() > 0 && compressionQueue[0].order == outCount { 418 compW := heap.Pop(&compressionQueue).(*CompressionWord) 419 outCount++ 420 if outCount == inCount { 421 close(out) 422 } 423 // Write to intermediate file 424 if _, e := intermediateW.Write(compW.word); e != nil { 425 return e 426 } 427 } 428 for compW := range out { 429 heap.Push(&compressionQueue, compW) 430 for compressionQueue.Len() > 0 && compressionQueue[0].order == outCount { 431 compW = heap.Pop(&compressionQueue).(*CompressionWord) 432 outCount++ 433 if outCount == inCount { 434 close(out) 435 } 436 // Write to intermediate file 437 if _, e := intermediateW.Write(compW.word); e != nil { 438 return e 439 } 440 } 441 } 442 } 443 if err = intermediateW.Flush(); err != nil { 444 return err 445 } 446 wg.Wait() 447 if lvl < log.LvlTrace { 448 log.Log(lvl, fmt.Sprintf("[%s] Replacement preprocessing", logPrefix), "took", time.Since(t)) 449 } 450 if _, err = intermediateFile.Seek(0, 0); err != nil { 451 return fmt.Errorf("return to the start of intermediate file: %w", err) 452 } 453 454 //var m runtime.MemStats 455 //common.ReadMemStats(&m) 456 //logger.Info(fmt.Sprintf("[%s] Dictionary build done", logPrefix), "input", common.ByteCount(inputSize.Load()), "output", common.ByteCount(outputSize.Load()), "alloc", common.ByteCount(m.Alloc), "sys", common.ByteCount(m.Sys)) 457 posMap := make(map[uint64]uint64) 458 for _, m := range posMaps { 459 for l, c := range m { 460 posMap[l] += c 461 } 462 } 463 //fmt.Printf("posMap = %v\n", posMap) 464 var patternList PatternList 465 distribution := make([]int, maxPatternLen+1) 466 for _, p := range code2pattern { 467 if p.uses > 0 { 468 patternList = append(patternList, p) 469 distribution[len(p.word)]++ 470 } 471 } 472 slices.SortFunc(patternList, patternListLess) 473 logCtx := make([]interface{}, 0, 8) 474 logCtx = append(logCtx, "patternList.Len", patternList.Len()) 475 476 i := 0 477 // Build Huffman tree for codes 478 var codeHeap PatternHeap 479 heap.Init(&codeHeap) 480 tieBreaker := uint64(0) 481 for codeHeap.Len()+(patternList.Len()-i) > 1 { 482 // New node 483 h := &PatternHuff{ 484 tieBreaker: tieBreaker, 485 } 486 if codeHeap.Len() > 0 && (i >= patternList.Len() || codeHeap[0].uses < patternList[i].uses) { 487 // Take h0 from the heap 488 h.h0 = heap.Pop(&codeHeap).(*PatternHuff) 489 h.h0.AddZero() 490 h.uses += h.h0.uses 491 } else { 492 // Take p0 from the list 493 h.p0 = patternList[i] 494 h.p0.code = 0 495 h.p0.codeBits = 1 496 h.uses += h.p0.uses 497 i++ 498 } 499 if codeHeap.Len() > 0 && (i >= patternList.Len() || codeHeap[0].uses < patternList[i].uses) { 500 // Take h1 from the heap 501 h.h1 = heap.Pop(&codeHeap).(*PatternHuff) 502 h.h1.AddOne() 503 h.uses += h.h1.uses 504 } else { 505 // Take p1 from the list 506 h.p1 = patternList[i] 507 h.p1.code = 1 508 h.p1.codeBits = 1 509 h.uses += h.p1.uses 510 i++ 511 } 512 tieBreaker++ 513 heap.Push(&codeHeap, h) 514 } 515 if codeHeap.Len() > 0 { 516 root := heap.Pop(&codeHeap).(*PatternHuff) 517 root.SetDepth(0) 518 } 519 // Calculate total size of the dictionary 520 var patternsSize uint64 521 for _, p := range patternList { 522 ns := binary.PutUvarint(numBuf[:], uint64(p.depth)) // Length of the word's depth 523 n := binary.PutUvarint(numBuf[:], uint64(len(p.word))) // Length of the word's length 524 patternsSize += uint64(ns + n + len(p.word)) 525 } 526 527 logCtx = append(logCtx, "patternsSize", common.ByteCount(patternsSize)) 528 for i, n := range distribution { 529 if n == 0 { 530 continue 531 } 532 logCtx = append(logCtx, fmt.Sprintf("%d", i), fmt.Sprintf("%d", n)) 533 } 534 if lvl < log.LvlTrace { 535 logger.Log(lvl, fmt.Sprintf("[%s] Effective dictionary", logPrefix), logCtx...) 536 } 537 cw := bufio.NewWriterSize(cf, 2*etl.BufIOSize) 538 // 1-st, output amount of words - just a useful metadata 539 binary.BigEndian.PutUint64(numBuf[:], inCount) // Dictionary size 540 if _, err = cw.Write(numBuf[:8]); err != nil { 541 return err 542 } 543 binary.BigEndian.PutUint64(numBuf[:], emptyWordsCount) 544 if _, err = cw.Write(numBuf[:8]); err != nil { 545 return err 546 } 547 // 2-nd, output dictionary size 548 binary.BigEndian.PutUint64(numBuf[:], patternsSize) // Dictionary size 549 if _, err = cw.Write(numBuf[:8]); err != nil { 550 return err 551 } 552 //fmt.Printf("patternsSize = %d\n", patternsSize) 553 // Write all the pattens 554 slices.SortFunc(patternList, patternListLess) 555 for _, p := range patternList { 556 ns := binary.PutUvarint(numBuf[:], uint64(p.depth)) 557 if _, err = cw.Write(numBuf[:ns]); err != nil { 558 return err 559 } 560 n := binary.PutUvarint(numBuf[:], uint64(len(p.word))) 561 if _, err = cw.Write(numBuf[:n]); err != nil { 562 return err 563 } 564 if _, err = cw.Write(p.word); err != nil { 565 return err 566 } 567 //fmt.Printf("[comp] depth=%d, code=[%b], codeLen=%d pattern=[%x]\n", p.depth, p.code, p.codeBits, p.word) 568 } 569 570 var positionList PositionList 571 pos2code := make(map[uint64]*Position) 572 for pos, uses := range posMap { 573 p := &Position{pos: pos, uses: uses, code: pos, codeBits: 0} 574 positionList = append(positionList, p) 575 pos2code[pos] = p 576 } 577 slices.SortFunc(positionList, positionListLess) 578 i = 0 579 // Build Huffman tree for codes 580 var posHeap PositionHeap 581 heap.Init(&posHeap) 582 tieBreaker = uint64(0) 583 for posHeap.Len()+(positionList.Len()-i) > 1 { 584 // New node 585 h := &PositionHuff{ 586 tieBreaker: tieBreaker, 587 } 588 if posHeap.Len() > 0 && (i >= positionList.Len() || posHeap[0].uses < positionList[i].uses) { 589 // Take h0 from the heap 590 h.h0 = heap.Pop(&posHeap).(*PositionHuff) 591 h.h0.AddZero() 592 h.uses += h.h0.uses 593 } else { 594 // Take p0 from the list 595 h.p0 = positionList[i] 596 h.p0.code = 0 597 h.p0.codeBits = 1 598 h.uses += h.p0.uses 599 i++ 600 } 601 if posHeap.Len() > 0 && (i >= positionList.Len() || posHeap[0].uses < positionList[i].uses) { 602 // Take h1 from the heap 603 h.h1 = heap.Pop(&posHeap).(*PositionHuff) 604 h.h1.AddOne() 605 h.uses += h.h1.uses 606 } else { 607 // Take p1 from the list 608 h.p1 = positionList[i] 609 h.p1.code = 1 610 h.p1.codeBits = 1 611 h.uses += h.p1.uses 612 i++ 613 } 614 tieBreaker++ 615 heap.Push(&posHeap, h) 616 } 617 if posHeap.Len() > 0 { 618 posRoot := heap.Pop(&posHeap).(*PositionHuff) 619 posRoot.SetDepth(0) 620 } 621 // Calculate the size of pos dictionary 622 var posSize uint64 623 for _, p := range positionList { 624 ns := binary.PutUvarint(numBuf[:], uint64(p.depth)) // Length of the position's depth 625 n := binary.PutUvarint(numBuf[:], p.pos) 626 posSize += uint64(ns + n) 627 } 628 // First, output dictionary size 629 binary.BigEndian.PutUint64(numBuf[:], posSize) // Dictionary size 630 if _, err = cw.Write(numBuf[:8]); err != nil { 631 return err 632 } 633 //fmt.Printf("posSize = %d\n", posSize) 634 // Write all the positions 635 slices.SortFunc(positionList, positionListLess) 636 for _, p := range positionList { 637 ns := binary.PutUvarint(numBuf[:], uint64(p.depth)) 638 if _, err = cw.Write(numBuf[:ns]); err != nil { 639 return err 640 } 641 n := binary.PutUvarint(numBuf[:], p.pos) 642 if _, err = cw.Write(numBuf[:n]); err != nil { 643 return err 644 } 645 //fmt.Printf("[comp] depth=%d, code=[%b], codeLen=%d pos=%d\n", p.depth, p.code, p.codeBits, p.pos) 646 } 647 if lvl < log.LvlTrace { 648 logger.Log(lvl, fmt.Sprintf("[%s] Positional dictionary", logPrefix), "positionList.len", positionList.Len(), "posSize", common.ByteCount(posSize)) 649 } 650 // Re-encode all the words with the use of optimised (via Huffman coding) dictionaries 651 wc := 0 652 var hc HuffmanCoder 653 hc.w = cw 654 r := bufio.NewReaderSize(intermediateFile, 2*etl.BufIOSize) 655 var l uint64 656 var e error 657 for l, e = binary.ReadUvarint(r); e == nil; l, e = binary.ReadUvarint(r) { 658 posCode := pos2code[l+1] 659 if posCode != nil { 660 if e = hc.encode(posCode.code, posCode.codeBits); e != nil { 661 return e 662 } 663 } 664 if l == 0 { 665 if e = hc.flush(); e != nil { 666 return e 667 } 668 } else { 669 var pNum uint64 // Number of patterns 670 if pNum, e = binary.ReadUvarint(r); e != nil { 671 return e 672 } 673 // Now reading patterns one by one 674 var lastPos uint64 675 var lastUncovered int 676 var uncoveredCount int 677 for i := 0; i < int(pNum); i++ { 678 var pos uint64 // Starting position for pattern 679 if pos, e = binary.ReadUvarint(r); e != nil { 680 return e 681 } 682 posCode = pos2code[pos-lastPos+1] 683 lastPos = pos 684 if posCode != nil { 685 if e = hc.encode(posCode.code, posCode.codeBits); e != nil { 686 return e 687 } 688 } 689 var code uint64 // Code of the pattern 690 if code, e = binary.ReadUvarint(r); e != nil { 691 return e 692 } 693 patternCode := code2pattern[code] 694 if int(pos) > lastUncovered { 695 uncoveredCount += int(pos) - lastUncovered 696 } 697 lastUncovered = int(pos) + len(patternCode.word) 698 if patternCode != nil { 699 if e = hc.encode(patternCode.code, patternCode.codeBits); e != nil { 700 return e 701 } 702 } 703 } 704 if int(l) > lastUncovered { 705 uncoveredCount += int(l) - lastUncovered 706 } 707 // Terminating position and flush 708 posCode = pos2code[0] 709 if e = hc.encode(posCode.code, posCode.codeBits); e != nil { 710 return e 711 } 712 if e = hc.flush(); e != nil { 713 return e 714 } 715 // Copy uncovered characters 716 if uncoveredCount > 0 { 717 if _, e = io.CopyN(cw, r, int64(uncoveredCount)); e != nil { 718 return e 719 } 720 } 721 } 722 wc++ 723 select { 724 case <-logEvery.C: 725 if lvl < log.LvlTrace { 726 logger.Log(lvl, fmt.Sprintf("[%s] Compressed", logPrefix), "processed", fmt.Sprintf("%.2f%%", 100*float64(wc)/float64(totalWords))) 727 } 728 default: 729 } 730 } 731 if e != nil && !errors.Is(e, io.EOF) { 732 return e 733 } 734 if err = intermediateFile.Close(); err != nil { 735 return err 736 } 737 if err = cw.Flush(); err != nil { 738 return err 739 } 740 return nil 741 } 742 743 // processSuperstring is the worker that processes one superstring and puts results 744 // into the collector, using lock to mutual exclusion. At the end (when the input channel is closed), 745 // it notifies the waitgroup before exiting, so that the caller known when all work is done 746 // No error channels for now 747 func processSuperstring(ctx context.Context, superstringCh chan []byte, dictCollector *etl.Collector, minPatternScore uint64, completion *sync.WaitGroup, logger log.Logger) { 748 defer completion.Done() 749 dictVal := make([]byte, 8) 750 dictKey := make([]byte, maxPatternLen) 751 var lcp, sa, inv []int32 752 for superstring := range superstringCh { 753 select { 754 case <-ctx.Done(): 755 return 756 default: 757 } 758 759 if cap(sa) < len(superstring) { 760 sa = make([]int32, len(superstring)) 761 } else { 762 sa = sa[:len(superstring)] 763 } 764 //log.Info("Superstring", "len", len(superstring)) 765 //start := time.Now() 766 if err := sais.Sais(superstring, sa); err != nil { 767 panic(err) 768 } 769 //log.Info("Suffix array built", "in", time.Since(start)) 770 // filter out suffixes that start with odd positions 771 n := len(sa) / 2 772 filtered := sa[:n] 773 //filtered := make([]int32, n) 774 var j int 775 for i := 0; i < len(sa); i++ { 776 if sa[i]&1 == 0 { 777 filtered[j] = sa[i] >> 1 778 j++ 779 } 780 } 781 // Now create an inverted array 782 if cap(inv) < n { 783 inv = make([]int32, n) 784 } else { 785 inv = inv[:n] 786 } 787 for i := 0; i < n; i++ { 788 inv[filtered[i]] = int32(i) 789 } 790 //logger.Info("Inverted array done") 791 var k int 792 // Process all suffixes one by one starting from 793 // first suffix in txt[] 794 if cap(lcp) < n { 795 lcp = make([]int32, n) 796 } else { 797 lcp = lcp[:n] 798 } 799 for i := 0; i < n; i++ { 800 /* If the current suffix is at n-1, then we don’t 801 have next substring to consider. So lcp is not 802 defined for this substring, we put zero. */ 803 if inv[i] == int32(n-1) { 804 k = 0 805 continue 806 } 807 808 /* j contains index of the next substring to 809 be considered to compare with the present 810 substring, i.e., next string in suffix array */ 811 j := int(filtered[inv[i]+1]) 812 813 // Directly start matching from k'th index as 814 // at-least k-1 characters will match 815 for i+k < n && j+k < n && superstring[(i+k)*2] != 0 && superstring[(j+k)*2] != 0 && superstring[(i+k)*2+1] == superstring[(j+k)*2+1] { 816 k++ 817 } 818 lcp[inv[i]] = int32(k) // lcp for the present suffix. 819 820 // Deleting the starting character from the string. 821 if k > 0 { 822 k-- 823 } 824 } 825 //log.Info("Kasai algorithm finished") 826 // Checking LCP array 827 828 if assert.Enable { 829 for i := 0; i < n-1; i++ { 830 var prefixLen int 831 p1 := int(filtered[i]) 832 p2 := int(filtered[i+1]) 833 for p1+prefixLen < n && 834 p2+prefixLen < n && 835 superstring[(p1+prefixLen)*2] != 0 && 836 superstring[(p2+prefixLen)*2] != 0 && 837 superstring[(p1+prefixLen)*2+1] == superstring[(p2+prefixLen)*2+1] { 838 prefixLen++ 839 } 840 if prefixLen != int(lcp[i]) { 841 logger.Error("Mismatch", "prefixLen", prefixLen, "lcp[i]", lcp[i], "i", i) 842 break 843 } 844 l := int(lcp[i]) // Length of potential dictionary word 845 if l < 2 { 846 continue 847 } 848 } 849 } 850 //logger.Info("LCP array checked") 851 // Walk over LCP array and compute the scores of the strings 852 var b = inv 853 j = 0 854 for i := 0; i < n-1; i++ { 855 // Only when there is a drop in LCP value 856 if lcp[i+1] >= lcp[i] { 857 j = i 858 continue 859 } 860 prevSkipped := false 861 for l := int(lcp[i]); l > int(lcp[i+1]) && l >= minPatternLen; l-- { 862 if l > maxPatternLen || 863 l > 20 && (l&(l-1)) != 0 { // is power of 2 864 prevSkipped = true 865 continue 866 } 867 868 // Go back 869 var isNew bool 870 for j > 0 && int(lcp[j-1]) >= l { 871 j-- 872 isNew = true 873 } 874 875 if !isNew && !prevSkipped { 876 break 877 } 878 879 window := i - j + 2 880 copy(b, filtered[j:i+2]) 881 slices.Sort(b[:window]) 882 repeats := 1 883 lastK := 0 884 for k := 1; k < window; k++ { 885 if b[k] >= b[lastK]+int32(l) { 886 repeats++ 887 lastK = k 888 } 889 } 890 891 if (l < 8 || l > 64) && repeats < int(minPatternScore) { 892 prevSkipped = true 893 continue 894 } 895 896 score := uint64(repeats * (l)) 897 if score < minPatternScore { 898 prevSkipped = true 899 continue 900 } 901 902 dictKey = dictKey[:l] 903 for s := 0; s < l; s++ { 904 dictKey[s] = superstring[(int(filtered[i])+s)*2+1] 905 } 906 binary.BigEndian.PutUint64(dictVal, score) 907 if err := dictCollector.Collect(dictKey, dictVal); err != nil { 908 logger.Error("processSuperstring", "collect", err) 909 } 910 prevSkipped = false //nolint 911 break 912 } 913 } 914 } 915 } 916 917 func DictionaryBuilderFromCollectors(ctx context.Context, logPrefix, tmpDir string, collectors []*etl.Collector, lvl log.Lvl, logger log.Logger) (*DictionaryBuilder, error) { 918 dictCollector := etl.NewCollector(logPrefix+"_collectDict", tmpDir, etl.NewSortableBuffer(etl.BufferOptimalSize), logger) 919 defer dictCollector.Close() 920 dictCollector.LogLvl(lvl) 921 922 dictAggregator := &DictAggregator{collector: dictCollector, dist: map[int]int{}} 923 for _, collector := range collectors { 924 if err := collector.Load(nil, "", dictAggregator.aggLoadFunc, etl.TransformArgs{Quit: ctx.Done()}); err != nil { 925 return nil, err 926 } 927 collector.Close() 928 } 929 if err := dictAggregator.finish(); err != nil { 930 return nil, err 931 } 932 db := &DictionaryBuilder{limit: maxDictPatterns} // Only collect 1m words with highest scores 933 if err := dictCollector.Load(nil, "", db.loadFunc, etl.TransformArgs{Quit: ctx.Done()}); err != nil { 934 return nil, err 935 } 936 db.finish() 937 938 db.Sort() 939 return db, nil 940 } 941 942 func PersistDictrionary(fileName string, db *DictionaryBuilder) error { 943 df, err := os.Create(fileName) 944 if err != nil { 945 return err 946 } 947 w := bufio.NewWriterSize(df, 2*etl.BufIOSize) 948 db.ForEach(func(score uint64, word []byte) { fmt.Fprintf(w, "%d %x\n", score, word) }) 949 if err = w.Flush(); err != nil { 950 return err 951 } 952 if err := df.Sync(); err != nil { 953 return err 954 } 955 return df.Close() 956 } 957 958 func ReadSimpleFile(fileName string, walker func(v []byte) error) error { 959 // Read keys from the file and generate superstring (with extra byte 0x1 prepended to each character, and with 0x0 0x0 pair inserted between keys and values) 960 // We only consider values with length > 2, because smaller values are not compressible without going into bits 961 f, err := os.Open(fileName) 962 if err != nil { 963 return err 964 } 965 defer f.Close() 966 r := bufio.NewReaderSize(f, etl.BufIOSize) 967 buf := make([]byte, 4096) 968 for l, e := binary.ReadUvarint(r); ; l, e = binary.ReadUvarint(r) { 969 if e != nil { 970 if errors.Is(e, io.EOF) { 971 break 972 } 973 return e 974 } 975 if len(buf) < int(l) { 976 buf = make([]byte, l) 977 } 978 if _, e = io.ReadFull(r, buf[:l]); e != nil { 979 return e 980 } 981 if err := walker(buf[:l]); err != nil { 982 return err 983 } 984 } 985 return nil 986 }