github.com/ledgerwatch/erigon-lib@v1.0.0/compress/parallel_compress.go

/*
   Copyright 2021 Erigon contributors

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package compress

import (
	"bufio"
	"container/heap"
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"os"
	"sync"
	"sync/atomic"
	"time"

	"github.com/ledgerwatch/erigon-lib/common"
	"github.com/ledgerwatch/erigon-lib/common/assert"
	"github.com/ledgerwatch/erigon-lib/etl"
	"github.com/ledgerwatch/erigon-lib/patricia"
	"github.com/ledgerwatch/erigon-lib/sais"
	"github.com/ledgerwatch/log/v3"
	"golang.org/x/exp/slices"
)
// MinPatternScore is the minimum score (per superstring) required to consider including a pattern in the dictionary
const MinPatternScore = 1024

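// optimiseCluster selects, via dynamic programming over cellRing, the subset of the
// pattern matches found in input that maximises an estimate of saved bytes (coverage
// minus a per-pattern overhead of 4 bytes, with the total pattern score as a
// tie-breaker), then appends the encoded word to output: uvarint(number of chosen
// patterns), then per pattern uvarint(start position) and uvarint(pattern code),
// followed by the input bytes not covered by any chosen pattern. posMap accumulates
// usage counts of position deltas, keyed with a +1 shift so that key 0 stays
// reserved as the word terminator.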
func optimiseCluster(trace bool, input []byte, mf2 *patricia.MatchFinder2, output []byte, uncovered []int, patterns []int, cellRing *Ring, posMap map[uint64]uint64) ([]byte, []int, []int) {
	matches := mf2.FindLongestMatches(input)

	if len(matches) == 0 {
		output = append(output, 0) // The uvarint encoding of 0 is a single zero byte
		output = append(output, input...)
		return output, patterns, uncovered
	}
	if trace {
		fmt.Printf("Cluster | input = %x\n", input)
		for _, match := range matches {
			fmt.Printf(" [%x %d-%d]", input[match.Start:match.End], match.Start, match.End)
		}
	}
	cellRing.Reset()
	patterns = append(patterns[:0], 0, 0) // Sentinel entry - no meaning
	lastF := matches[len(matches)-1]
	for j := lastF.Start; j < lastF.End; j++ {
		d := cellRing.PushBack()
		d.optimStart = j + 1
		d.coverStart = len(input)
		d.compression = 0
		d.patternIdx = 0
		d.score = 0
	}
	// Starting from the last match
	for i := len(matches); i > 0; i-- {
		f := matches[i-1]
		p := f.Val.(*Pattern)
		firstCell := cellRing.Get(0)
		maxCompression := firstCell.compression
		maxScore := firstCell.score
		maxCell := firstCell
		var maxInclude bool
		for e := 0; e < cellRing.Len(); e++ {
			cell := cellRing.Get(e)
			comp := cell.compression - 4
			if cell.coverStart >= f.End {
				comp += f.End - f.Start
			} else {
				comp += cell.coverStart - f.Start
			}
			score := cell.score + p.score
			if comp > maxCompression || (comp == maxCompression && score > maxScore) {
				maxCompression = comp
				maxScore = score
				maxInclude = true
				maxCell = cell
			} else if cell.optimStart > f.End {
				cellRing.Truncate(e)
				break
			}
		}
		d := cellRing.PushFront()
		d.optimStart = f.Start
		d.score = maxScore
		d.compression = maxCompression
		if maxInclude {
			if trace {
				fmt.Printf("[include] cell for %d: with patterns", f.Start)
				fmt.Printf(" [%x %d-%d]", input[f.Start:f.End], f.Start, f.End)
				patternIdx := maxCell.patternIdx
				for patternIdx != 0 {
					pattern := patterns[patternIdx]
					fmt.Printf(" [%x %d-%d]", input[matches[pattern].Start:matches[pattern].End], matches[pattern].Start, matches[pattern].End)
					patternIdx = patterns[patternIdx+1]
				}
				fmt.Printf("\n\n")
			}
			d.coverStart = f.Start
			d.patternIdx = len(patterns)
			patterns = append(patterns, i-1, maxCell.patternIdx)
		} else {
			if trace {
				fmt.Printf("cell for %d: with patterns", f.Start)
				patternIdx := maxCell.patternIdx
				for patternIdx != 0 {
					pattern := patterns[patternIdx]
					fmt.Printf(" [%x %d-%d]", input[matches[pattern].Start:matches[pattern].End], matches[pattern].Start, matches[pattern].End)
					patternIdx = patterns[patternIdx+1]
				}
				fmt.Printf("\n\n")
			}
			d.coverStart = maxCell.coverStart
			d.patternIdx = maxCell.patternIdx
		}
	}
	optimCell := cellRing.Get(0)
	if trace {
		fmt.Printf("optimal =")
	}
	// Count number of patterns
	var patternCount uint64
	patternIdx := optimCell.patternIdx
	for patternIdx != 0 {
		patternCount++
		patternIdx = patterns[patternIdx+1]
	}
	var numBuf [binary.MaxVarintLen64]byte
	p := binary.PutUvarint(numBuf[:], patternCount)
	output = append(output, numBuf[:p]...)
	patternIdx = optimCell.patternIdx
	lastStart := 0
	var lastUncovered int
	uncovered = uncovered[:0]
	for patternIdx != 0 {
		pattern := patterns[patternIdx]
		p := matches[pattern].Val.(*Pattern)
		if trace {
			fmt.Printf(" [%x %d-%d]", input[matches[pattern].Start:matches[pattern].End], matches[pattern].Start, matches[pattern].End)
		}
		if matches[pattern].Start > lastUncovered {
			uncovered = append(uncovered, lastUncovered, matches[pattern].Start)
		}
		lastUncovered = matches[pattern].End
		// Starting position
		posMap[uint64(matches[pattern].Start-lastStart+1)]++
		lastStart = matches[pattern].Start
		n := binary.PutUvarint(numBuf[:], uint64(matches[pattern].Start))
		output = append(output, numBuf[:n]...)
		// Code
		n = binary.PutUvarint(numBuf[:], p.code)
		output = append(output, numBuf[:n]...)
		atomic.AddUint64(&p.uses, 1)
		patternIdx = patterns[patternIdx+1]
	}
	if len(input) > lastUncovered {
		uncovered = append(uncovered, lastUncovered, len(input))
	}
	if trace {
		fmt.Printf("\n\n")
	}
	// Append the input bytes not covered by any chosen pattern
	for i := 0; i < len(uncovered); i += 2 {
		output = append(output, input[uncovered[i]:uncovered[i+1]]...)
	}
	return output, patterns, uncovered
}

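// reduceDictWorker reads words from inputCh, encodes each one against the shared
// patricia trie via optimiseCluster, and sends the result to outCh. Each worker owns
// its posMap (so no locking is needed); the caller merges the per-worker maps after
// wg.Wait(). Keys are word lengths and position deltas shifted by +1, with key 0
// counting word terminators.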
func reduceDictWorker(trace bool, inputCh chan *CompressionWord, outCh chan *CompressionWord, completion *sync.WaitGroup, trie *patricia.PatriciaTree, inputSize, outputSize *atomic.Uint64, posMap map[uint64]uint64) {
	defer completion.Done()
	var output = make([]byte, 0, 256)
	var uncovered = make([]int, 256)
	var patterns = make([]int, 0, 256)
	cellRing := NewRing()
	mf2 := patricia.NewMatchFinder2(trie)
	var numBuf [binary.MaxVarintLen64]byte
	for compW := range inputCh {
		wordLen := uint64(len(compW.word))
		n := binary.PutUvarint(numBuf[:], wordLen)
		output = append(output[:0], numBuf[:n]...) // Prepend with the encoding of length
		output, patterns, uncovered = optimiseCluster(trace, compW.word, mf2, output, uncovered, patterns, cellRing, posMap)
		compW.word = append(compW.word[:0], output...)
		outCh <- compW
		inputSize.Add(1 + wordLen)
		outputSize.Add(uint64(len(output)))
		posMap[wordLen+1]++
		posMap[0]++
	}
}

// CompressionWord holds a word to be compressed (if its compression flag is set) and, after processing, the result of the compression.
// To allow multiple words to be processed concurrently, the order field is used to reassemble
// the words after processing without disrupting their original order
type CompressionWord struct {
	word  []byte
	order uint64
}

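// CompressionQueue is a min-heap (used with container/heap) of CompressionWords keyed
// on the order field; workers finish words out of order, and the heap lets the writer
// pop them back in their original sequence. An illustrative sketch:
//
//	var cq CompressionQueue
//	heap.Init(&cq)
//	heap.Push(&cq, &CompressionWord{order: 2})
//	heap.Push(&cq, &CompressionWord{order: 0})
//	w := heap.Pop(&cq).(*CompressionWord) // w.order == 0 - smallest order first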
type CompressionQueue []*CompressionWord

func (cq CompressionQueue) Len() int {
	return len(cq)
}

func (cq CompressionQueue) Less(i, j int) bool {
	return cq[i].order < cq[j].order
}

func (cq *CompressionQueue) Swap(i, j int) {
	(*cq)[i], (*cq)[j] = (*cq)[j], (*cq)[i]
}

func (cq *CompressionQueue) Push(x interface{}) {
	*cq = append(*cq, x.(*CompressionWord))
}

func (cq *CompressionQueue) Pop() interface{} {
	old := *cq
	n := len(old)
	x := old[n-1]
	old[n-1] = nil
	*cq = old[0 : n-1]
	return x
}

// reducedict reduces the dictionary by trying the substitutions and counting the frequency of each word
func reducedict(ctx context.Context, trace bool, logPrefix, segmentFilePath string, cf *os.File, datFile *DecompressedFile, workers int, dictBuilder *DictionaryBuilder, lvl log.Lvl, logger log.Logger) error {
	logEvery := time.NewTicker(60 * time.Second)
	defer logEvery.Stop()

	// DictionaryBuilder is for sorting words by their frequency (to assign codes)
	var pt patricia.PatriciaTree
	code2pattern := make([]*Pattern, 0, 256)
	dictBuilder.ForEach(func(score uint64, word []byte) {
		p := &Pattern{
			score:    score,
			uses:     0,
			code:     uint64(len(code2pattern)),
			codeBits: 0,
			word:     word,
		}
		pt.Insert(word, p)
		code2pattern = append(code2pattern, p)
	})
	dictBuilder.Close()
	if lvl < log.LvlTrace {
		logger.Log(lvl, fmt.Sprintf("[%s] dictionary file parsed", logPrefix), "entries", len(code2pattern))
	}
	ch := make(chan *CompressionWord, 10_000)
	inputSize, outputSize := &atomic.Uint64{}, &atomic.Uint64{}

	var collectors []*etl.Collector
	defer func() {
		for _, c := range collectors {
			c.Close()
		}
	}()
	out := make(chan *CompressionWord, 1024)
	var compressionQueue CompressionQueue
	heap.Init(&compressionQueue)
	queueLimit := 128 * 1024

	// For the case of workers == 1
	var output = make([]byte, 0, 256)
	var uncovered = make([]int, 256)
	var patterns = make([]int, 0, 256)
	cellRing := NewRing()
	mf2 := patricia.NewMatchFinder2(&pt)

	var posMaps []map[uint64]uint64
	uncompPosMap := make(map[uint64]uint64) // For the uncompressed words
	posMaps = append(posMaps, uncompPosMap)
	var wg sync.WaitGroup
	if workers > 1 {
		for i := 0; i < workers; i++ {
			posMap := make(map[uint64]uint64)
			posMaps = append(posMaps, posMap)
			wg.Add(1)
			go reduceDictWorker(trace, ch, out, &wg, &pt, inputSize, outputSize, posMap)
		}
	}
	t := time.Now()

	var err error
	intermediatePath := segmentFilePath + ".tmp"
	defer os.Remove(intermediatePath)
	var intermediateFile *os.File
	if intermediateFile, err = os.Create(intermediatePath); err != nil {
		return fmt.Errorf("create intermediate file: %w", err)
	}
	defer intermediateFile.Close()
	intermediateW := bufio.NewWriterSize(intermediateFile, 8*etl.BufIOSize)

	var inCount, outCount, emptyWordsCount uint64 // counters of words sent to compression and of words written back out
	var numBuf [binary.MaxVarintLen64]byte
	totalWords := datFile.count

	if err = datFile.ForEach(func(v []byte, compression bool) error {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
		if workers > 1 {
			// take processed words in a non-blocking way and push them to the queue
		outer:
			for {
				select {
				case compW := <-out:
					heap.Push(&compressionQueue, compW)
				default:
					break outer
				}
			}
			// take processed words in a blocking way until either:
			// 1. compressionQueue is below the limit so that new words can be allocated
			// 2. the word on top of the queue is the next in order, so it can be written out and its buffer reused
			for compressionQueue.Len() >= queueLimit && compressionQueue[0].order < outCount {
				// Blocking wait to receive some outputs until the top of queue can be processed
				compW := <-out
				heap.Push(&compressionQueue, compW)
			}
			var compW *CompressionWord
			// Either take the word from the top, write it down and reuse it for the next unprocessed word,
			// or allocate a new word
			if compressionQueue.Len() > 0 && compressionQueue[0].order == outCount {
				compW = heap.Pop(&compressionQueue).(*CompressionWord)
				outCount++
				// Write to intermediate file
				if _, e := intermediateW.Write(compW.word); e != nil {
					return e
				}
				// Reuse compW for the next word
			} else {
				compW = &CompressionWord{}
			}
			compW.order = inCount
			if len(v) == 0 {
				// Empty word, cannot be compressed
				compW.word = append(compW.word[:0], 0)
				uncompPosMap[1]++
				uncompPosMap[0]++
				heap.Push(&compressionQueue, compW) // Push to the queue directly, bypassing compression
			} else if compression {
				compW.word = append(compW.word[:0], v...)
				ch <- compW // Send for compression
			} else {
				// Prepend the word with the encoding of its length plus a zero byte, which indicates that no patterns are to be found in this word
				wordLen := uint64(len(v))
				n := binary.PutUvarint(numBuf[:], wordLen)
				uncompPosMap[wordLen+1]++
				uncompPosMap[0]++
				compW.word = append(append(append(compW.word[:0], numBuf[:n]...), 0), v...)
				heap.Push(&compressionQueue, compW) // Push to the queue directly, bypassing compression
			}
		} else {
			outCount++
			wordLen := uint64(len(v))
			n := binary.PutUvarint(numBuf[:], wordLen)
			if _, e := intermediateW.Write(numBuf[:n]); e != nil {
				return e
			}
			if wordLen > 0 {
				if compression {
					output, patterns, uncovered = optimiseCluster(trace, v, mf2, output[:0], uncovered, patterns, cellRing, uncompPosMap)
					if _, e := intermediateW.Write(output); e != nil {
						return e
					}
					outputSize.Add(uint64(len(output)))
				} else {
					if e := intermediateW.WriteByte(0); e != nil {
						return e
					}
					if _, e := intermediateW.Write(v); e != nil {
						return e
					}
					outputSize.Add(1 + uint64(len(v)))
				}
			}
			inputSize.Add(1 + wordLen)
			uncompPosMap[wordLen+1]++
			uncompPosMap[0]++
		}
		inCount++
		if len(v) == 0 {
			emptyWordsCount++
		}

		select {
		case <-logEvery.C:
			if lvl < log.LvlTrace {
				logger.Log(lvl, fmt.Sprintf("[%s] Replacement preprocessing", logPrefix), "processed", fmt.Sprintf("%.2f%%", 100*float64(outCount)/float64(totalWords)), "ch", len(ch), "workers", workers)
			}
		default:
		}
		return nil
	}); err != nil {
		return err
	}
	close(ch)
	// Drain the out queue if necessary
	if inCount > outCount {
		for compressionQueue.Len() > 0 && compressionQueue[0].order == outCount {
			compW := heap.Pop(&compressionQueue).(*CompressionWord)
			outCount++
			if outCount == inCount {
				close(out)
			}
			// Write to intermediate file
			if _, e := intermediateW.Write(compW.word); e != nil {
				return e
			}
		}
		for compW := range out {
			heap.Push(&compressionQueue, compW)
			for compressionQueue.Len() > 0 && compressionQueue[0].order == outCount {
				compW = heap.Pop(&compressionQueue).(*CompressionWord)
				outCount++
				if outCount == inCount {
					close(out)
				}
				// Write to intermediate file
				if _, e := intermediateW.Write(compW.word); e != nil {
					return e
				}
			}
		}
	}
	if err = intermediateW.Flush(); err != nil {
		return err
	}
	wg.Wait()
	if lvl < log.LvlTrace {
		logger.Log(lvl, fmt.Sprintf("[%s] Replacement preprocessing", logPrefix), "took", time.Since(t))
	}
	if _, err = intermediateFile.Seek(0, 0); err != nil {
		return fmt.Errorf("return to the start of intermediate file: %w", err)
	}

	//var m runtime.MemStats
	//common.ReadMemStats(&m)
	//logger.Info(fmt.Sprintf("[%s] Dictionary build done", logPrefix), "input", common.ByteCount(inputSize.Load()), "output", common.ByteCount(outputSize.Load()), "alloc", common.ByteCount(m.Alloc), "sys", common.ByteCount(m.Sys))
	posMap := make(map[uint64]uint64)
	for _, m := range posMaps {
		for l, c := range m {
			posMap[l] += c
		}
	}
	//fmt.Printf("posMap = %v\n", posMap)
	var patternList PatternList
	distribution := make([]int, maxPatternLen+1)
	for _, p := range code2pattern {
		if p.uses > 0 {
			patternList = append(patternList, p)
			distribution[len(p.word)]++
		}
	}
	slices.SortFunc(patternList, patternListLess)
	logCtx := make([]interface{}, 0, 8)
	logCtx = append(logCtx, "patternList.Len", patternList.Len())

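	// The loop below is the classic two-queue Huffman construction: leaves are drawn in
	// order of ascending use count from patternList (sorted above via patternListLess),
	// while freshly merged internal nodes accumulate in codeHeap; each iteration merges
	// the two lowest-use nodes. tieBreaker keeps the heap ordering deterministic.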
	i := 0
	// Build Huffman tree for codes
	var codeHeap PatternHeap
	heap.Init(&codeHeap)
	tieBreaker := uint64(0)
	for codeHeap.Len()+(patternList.Len()-i) > 1 {
		// New node
		h := &PatternHuff{
			tieBreaker: tieBreaker,
		}
		if codeHeap.Len() > 0 && (i >= patternList.Len() || codeHeap[0].uses < patternList[i].uses) {
			// Take h0 from the heap
			h.h0 = heap.Pop(&codeHeap).(*PatternHuff)
			h.h0.AddZero()
			h.uses += h.h0.uses
		} else {
			// Take p0 from the list
			h.p0 = patternList[i]
			h.p0.code = 0
			h.p0.codeBits = 1
			h.uses += h.p0.uses
			i++
		}
		if codeHeap.Len() > 0 && (i >= patternList.Len() || codeHeap[0].uses < patternList[i].uses) {
			// Take h1 from the heap
			h.h1 = heap.Pop(&codeHeap).(*PatternHuff)
			h.h1.AddOne()
			h.uses += h.h1.uses
		} else {
			// Take p1 from the list
			h.p1 = patternList[i]
			h.p1.code = 1
			h.p1.codeBits = 1
			h.uses += h.p1.uses
			i++
		}
		tieBreaker++
		heap.Push(&codeHeap, h)
	}
	if codeHeap.Len() > 0 {
		root := heap.Pop(&codeHeap).(*PatternHuff)
		root.SetDepth(0)
	}
	// Calculate the total size of the dictionary
	var patternsSize uint64
	for _, p := range patternList {
		ns := binary.PutUvarint(numBuf[:], uint64(p.depth))    // Size of the encoded depth of the word
		n := binary.PutUvarint(numBuf[:], uint64(len(p.word))) // Size of the encoded length of the word
		patternsSize += uint64(ns + n + len(p.word))
	}

	logCtx = append(logCtx, "patternsSize", common.ByteCount(patternsSize))
	for i, n := range distribution {
		if n == 0 {
			continue
		}
		logCtx = append(logCtx, fmt.Sprintf("%d", i), fmt.Sprintf("%d", n))
	}
	if lvl < log.LvlTrace {
		logger.Log(lvl, fmt.Sprintf("[%s] Effective dictionary", logPrefix), logCtx...)
	}
	cw := bufio.NewWriterSize(cf, 2*etl.BufIOSize)
	// First, output the number of words - useful metadata
	binary.BigEndian.PutUint64(numBuf[:], inCount) // Word count
	if _, err = cw.Write(numBuf[:8]); err != nil {
		return err
	}
	binary.BigEndian.PutUint64(numBuf[:], emptyWordsCount)
	if _, err = cw.Write(numBuf[:8]); err != nil {
		return err
	}
	// Second, output the dictionary size
	binary.BigEndian.PutUint64(numBuf[:], patternsSize) // Dictionary size
	if _, err = cw.Write(numBuf[:8]); err != nil {
		return err
	}
	//fmt.Printf("patternsSize = %d\n", patternsSize)
	// Write all the patterns
	slices.SortFunc(patternList, patternListLess)
	for _, p := range patternList {
		ns := binary.PutUvarint(numBuf[:], uint64(p.depth))
		if _, err = cw.Write(numBuf[:ns]); err != nil {
			return err
		}
		n := binary.PutUvarint(numBuf[:], uint64(len(p.word)))
		if _, err = cw.Write(numBuf[:n]); err != nil {
			return err
		}
		if _, err = cw.Write(p.word); err != nil {
			return err
		}
		//fmt.Printf("[comp] depth=%d, code=[%b], codeLen=%d pattern=[%x]\n", p.depth, p.code, p.codeBits, p.word)
	}

	var positionList PositionList
	pos2code := make(map[uint64]*Position)
	for pos, uses := range posMap {
		p := &Position{pos: pos, uses: uses, code: pos, codeBits: 0}
		positionList = append(positionList, p)
		pos2code[pos] = p
	}
	slices.SortFunc(positionList, positionListLess)
	i = 0
	// Build Huffman tree for codes
	var posHeap PositionHeap
	heap.Init(&posHeap)
	tieBreaker = uint64(0)
	for posHeap.Len()+(positionList.Len()-i) > 1 {
		// New node
		h := &PositionHuff{
			tieBreaker: tieBreaker,
		}
		if posHeap.Len() > 0 && (i >= positionList.Len() || posHeap[0].uses < positionList[i].uses) {
			// Take h0 from the heap
			h.h0 = heap.Pop(&posHeap).(*PositionHuff)
			h.h0.AddZero()
			h.uses += h.h0.uses
		} else {
			// Take p0 from the list
			h.p0 = positionList[i]
			h.p0.code = 0
			h.p0.codeBits = 1
			h.uses += h.p0.uses
			i++
		}
		if posHeap.Len() > 0 && (i >= positionList.Len() || posHeap[0].uses < positionList[i].uses) {
			// Take h1 from the heap
			h.h1 = heap.Pop(&posHeap).(*PositionHuff)
			h.h1.AddOne()
			h.uses += h.h1.uses
		} else {
			// Take p1 from the list
			h.p1 = positionList[i]
			h.p1.code = 1
			h.p1.codeBits = 1
			h.uses += h.p1.uses
			i++
		}
		tieBreaker++
		heap.Push(&posHeap, h)
	}
	if posHeap.Len() > 0 {
		posRoot := heap.Pop(&posHeap).(*PositionHuff)
		posRoot.SetDepth(0)
	}
	// Calculate the size of the positional dictionary
	var posSize uint64
	for _, p := range positionList {
		ns := binary.PutUvarint(numBuf[:], uint64(p.depth)) // Size of the encoded depth of the position
		n := binary.PutUvarint(numBuf[:], p.pos)
		posSize += uint64(ns + n)
	}
	// First, output the size of the positional dictionary
	binary.BigEndian.PutUint64(numBuf[:], posSize) // Dictionary size
	if _, err = cw.Write(numBuf[:8]); err != nil {
		return err
	}
	//fmt.Printf("posSize = %d\n", posSize)
	// Write all the positions
	slices.SortFunc(positionList, positionListLess)
	for _, p := range positionList {
		ns := binary.PutUvarint(numBuf[:], uint64(p.depth))
		if _, err = cw.Write(numBuf[:ns]); err != nil {
			return err
		}
		n := binary.PutUvarint(numBuf[:], p.pos)
		if _, err = cw.Write(numBuf[:n]); err != nil {
			return err
		}
		//fmt.Printf("[comp] depth=%d, code=[%b], codeLen=%d pos=%d\n", p.depth, p.code, p.codeBits, p.pos)
	}
	if lvl < log.LvlTrace {
		logger.Log(lvl, fmt.Sprintf("[%s] Positional dictionary", logPrefix), "positionList.len", positionList.Len(), "posSize", common.ByteCount(posSize))
	}
	// Re-encode all the words with the use of optimised (via Huffman coding) dictionaries
	wc := 0
	var hc HuffmanCoder
	hc.w = cw
	r := bufio.NewReaderSize(intermediateFile, 2*etl.BufIOSize)
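	// Each word in the intermediate file was laid out by optimiseCluster (or the
	// uncompressed branch above) as: uvarint(word length); then, for non-empty words,
	// uvarint(pattern count) followed by (uvarint(pattern start), uvarint(pattern code))
	// pairs and finally the uncovered bytes verbatim. The loop below re-reads that
	// stream and Huffman-encodes it into cw.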
	var l uint64
	var e error
	for l, e = binary.ReadUvarint(r); e == nil; l, e = binary.ReadUvarint(r) {
		posCode := pos2code[l+1]
		if posCode != nil {
			if e = hc.encode(posCode.code, posCode.codeBits); e != nil {
				return e
			}
		}
		if l == 0 {
			if e = hc.flush(); e != nil {
				return e
			}
		} else {
			var pNum uint64 // Number of patterns
			if pNum, e = binary.ReadUvarint(r); e != nil {
				return e
			}
			// Now reading patterns one by one
			var lastPos uint64
			var lastUncovered int
			var uncoveredCount int
			for i := 0; i < int(pNum); i++ {
				var pos uint64 // Starting position for pattern
				if pos, e = binary.ReadUvarint(r); e != nil {
					return e
				}
				posCode = pos2code[pos-lastPos+1]
				lastPos = pos
				if posCode != nil {
					if e = hc.encode(posCode.code, posCode.codeBits); e != nil {
						return e
					}
				}
				var code uint64 // Code of the pattern
				if code, e = binary.ReadUvarint(r); e != nil {
					return e
				}
				patternCode := code2pattern[code]
				if int(pos) > lastUncovered {
					uncoveredCount += int(pos) - lastUncovered
				}
				lastUncovered = int(pos) + len(patternCode.word)
				if patternCode != nil {
					if e = hc.encode(patternCode.code, patternCode.codeBits); e != nil {
						return e
					}
				}
			}
			if int(l) > lastUncovered {
				uncoveredCount += int(l) - lastUncovered
			}
			// Terminating position and flush
			posCode = pos2code[0]
			if e = hc.encode(posCode.code, posCode.codeBits); e != nil {
				return e
			}
			if e = hc.flush(); e != nil {
				return e
			}
			// Copy uncovered characters
			if uncoveredCount > 0 {
				if _, e = io.CopyN(cw, r, int64(uncoveredCount)); e != nil {
					return e
				}
			}
		}
		wc++
		select {
		case <-logEvery.C:
			if lvl < log.LvlTrace {
				logger.Log(lvl, fmt.Sprintf("[%s] Compressed", logPrefix), "processed", fmt.Sprintf("%.2f%%", 100*float64(wc)/float64(totalWords)))
			}
		default:
		}
	}
	if e != nil && !errors.Is(e, io.EOF) {
		return e
	}
	if err = intermediateFile.Close(); err != nil {
		return err
	}
	if err = cw.Flush(); err != nil {
		return err
	}
	return nil
}

// processSuperstring is the worker that processes one superstring and puts results
// into the collector, using a lock for mutual exclusion. At the end (when the input channel is closed),
// it notifies the waitgroup before exiting, so that the caller knows when all work is done.
// No error channels for now
func processSuperstring(ctx context.Context, superstringCh chan []byte, dictCollector *etl.Collector, minPatternScore uint64, completion *sync.WaitGroup, logger log.Logger) {
	defer completion.Done()
	dictVal := make([]byte, 8)
	dictKey := make([]byte, maxPatternLen)
	var lcp, sa, inv []int32
	for superstring := range superstringCh {
		select {
		case <-ctx.Done():
			return
		default:
		}

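		// The superstring interleaves marker and data bytes: even positions hold 0x01
		// (0x00 at word boundaries), odd positions hold the actual bytes. That is why
		// suffixes are filtered down to even positions below, and why all character
		// comparisons index the superstring at (pos)*2 and (pos)*2+1.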
		if cap(sa) < len(superstring) {
			sa = make([]int32, len(superstring))
		} else {
			sa = sa[:len(superstring)]
		}
		//log.Info("Superstring", "len", len(superstring))
		//start := time.Now()
		if err := sais.Sais(superstring, sa); err != nil {
			panic(err)
		}
		//log.Info("Suffix array built", "in", time.Since(start))
		// filter out suffixes that start at odd positions
		n := len(sa) / 2
		filtered := sa[:n]
		//filtered := make([]int32, n)
		var j int
		for i := 0; i < len(sa); i++ {
			if sa[i]&1 == 0 {
				filtered[j] = sa[i] >> 1
				j++
			}
		}
		// Now create an inverted array
		if cap(inv) < n {
			inv = make([]int32, n)
		} else {
			inv = inv[:n]
		}
		for i := 0; i < n; i++ {
			inv[filtered[i]] = int32(i)
		}
		//logger.Info("Inverted array done")
		var k int
		// Process all suffixes one by one starting from
		// first suffix in txt[]
		if cap(lcp) < n {
			lcp = make([]int32, n)
		} else {
			lcp = lcp[:n]
		}
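		// Kasai's algorithm: compute the LCP array in O(n) by visiting suffixes in text
		// order and reusing the previous overlap, since k can shrink by at most one per step.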
		for i := 0; i < n; i++ {
			/* If the current suffix is at n-1, then we don't
			   have a next substring to consider, so lcp is not
			   defined for this suffix and we put zero. */
			if inv[i] == int32(n-1) {
				k = 0
				continue
			}

			/* j contains the index of the next substring to
			   be considered when comparing with the present
			   substring, i.e., the next string in the suffix array */
			j := int(filtered[inv[i]+1])

			// Directly start matching from the k'th index as
			// at least k-1 characters will match
			for i+k < n && j+k < n && superstring[(i+k)*2] != 0 && superstring[(j+k)*2] != 0 && superstring[(i+k)*2+1] == superstring[(j+k)*2+1] {
				k++
			}
			lcp[inv[i]] = int32(k) // lcp for the present suffix.

			// Deleting the starting character from the string.
			if k > 0 {
				k--
			}
		}
		//log.Info("Kasai algorithm finished")
		// Checking LCP array

		if assert.Enable {
			for i := 0; i < n-1; i++ {
				var prefixLen int
				p1 := int(filtered[i])
				p2 := int(filtered[i+1])
				for p1+prefixLen < n &&
					p2+prefixLen < n &&
					superstring[(p1+prefixLen)*2] != 0 &&
					superstring[(p2+prefixLen)*2] != 0 &&
					superstring[(p1+prefixLen)*2+1] == superstring[(p2+prefixLen)*2+1] {
					prefixLen++
				}
				if prefixLen != int(lcp[i]) {
					logger.Error("Mismatch", "prefixLen", prefixLen, "lcp[i]", lcp[i], "i", i)
					break
				}
				l := int(lcp[i]) // Length of potential dictionary word
				if l < 2 {
					continue
				}
			}
		}
		//logger.Info("LCP array checked")
		// Walk over LCP array and compute the scores of the strings
		var b = inv
		j = 0
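		// Each drop in the LCP value marks candidate patterns shared by the suffixes in
		// the window [j, i+1]. For every candidate length l, the window (b reuses inv as
		// scratch space) is sorted and repeats counts non-overlapping occurrences; a
		// candidate is collected only if repeats*l reaches minPatternScore.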
		for i := 0; i < n-1; i++ {
			// Only when there is a drop in LCP value
			if lcp[i+1] >= lcp[i] {
				j = i
				continue
			}
			prevSkipped := false
			for l := int(lcp[i]); l > int(lcp[i+1]) && l >= minPatternLen; l-- {
				if l > maxPatternLen ||
					l > 20 && (l&(l-1)) != 0 { // skip lengths over 20 that are not powers of 2
					prevSkipped = true
					continue
				}

				// Go back
				var isNew bool
				for j > 0 && int(lcp[j-1]) >= l {
					j--
					isNew = true
				}

				if !isNew && !prevSkipped {
					break
				}

				window := i - j + 2
				copy(b, filtered[j:i+2])
				slices.Sort(b[:window])
				repeats := 1
				lastK := 0
				for k := 1; k < window; k++ {
					if b[k] >= b[lastK]+int32(l) {
						repeats++
						lastK = k
					}
				}

				if (l < 8 || l > 64) && repeats < int(minPatternScore) {
					prevSkipped = true
					continue
				}

				score := uint64(repeats * (l))
				if score < minPatternScore {
					prevSkipped = true
					continue
				}

				dictKey = dictKey[:l]
				for s := 0; s < l; s++ {
					dictKey[s] = superstring[(int(filtered[i])+s)*2+1]
				}
				binary.BigEndian.PutUint64(dictVal, score)
				if err := dictCollector.Collect(dictKey, dictVal); err != nil {
					logger.Error("processSuperstring", "collect", err)
				}
				prevSkipped = false //nolint
				break
			}
		}
	}
}

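// DictionaryBuilderFromCollectors merges the per-worker pattern collectors into a single
// DictionaryBuilder: scores of identical patterns are summed by the DictAggregator, and
// only the highest-scoring patterns (up to maxDictPatterns) are kept and sorted.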
func DictionaryBuilderFromCollectors(ctx context.Context, logPrefix, tmpDir string, collectors []*etl.Collector, lvl log.Lvl, logger log.Logger) (*DictionaryBuilder, error) {
	dictCollector := etl.NewCollector(logPrefix+"_collectDict", tmpDir, etl.NewSortableBuffer(etl.BufferOptimalSize), logger)
	defer dictCollector.Close()
	dictCollector.LogLvl(lvl)

	dictAggregator := &DictAggregator{collector: dictCollector, dist: map[int]int{}}
	for _, collector := range collectors {
		if err := collector.Load(nil, "", dictAggregator.aggLoadFunc, etl.TransformArgs{Quit: ctx.Done()}); err != nil {
			return nil, err
		}
		collector.Close()
	}
	if err := dictAggregator.finish(); err != nil {
		return nil, err
	}
	db := &DictionaryBuilder{limit: maxDictPatterns} // Only collect up to maxDictPatterns words with the highest scores
	if err := dictCollector.Load(nil, "", db.loadFunc, etl.TransformArgs{Quit: ctx.Done()}); err != nil {
		return nil, err
	}
	db.finish()

	db.Sort()
	return db, nil
}

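// PersistDictrionary writes the dictionary to fileName as plain text, one entry per
// line in the form "<score> <hex-encoded word>", then syncs and closes the file.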
func PersistDictrionary(fileName string, db *DictionaryBuilder) error {
	df, err := os.Create(fileName)
	if err != nil {
		return err
	}
	w := bufio.NewWriterSize(df, 2*etl.BufIOSize)
	db.ForEach(func(score uint64, word []byte) { fmt.Fprintf(w, "%d %x\n", score, word) })
	if err = w.Flush(); err != nil {
		return err
	}
	if err := df.Sync(); err != nil {
		return err
	}
	return df.Close()
}

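// ReadSimpleFile streams uvarint-length-prefixed values from fileName and invokes walker
// on each one; the buffer passed to walker is reused between calls. A minimal usage
// sketch (the file name here is hypothetical):
//
//	err := ReadSimpleFile("words.dat", func(v []byte) error {
//		fmt.Printf("%x\n", v) // v is only valid for the duration of this call
//		return nil
//	})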
func ReadSimpleFile(fileName string, walker func(v []byte) error) error {
	// Read keys from the file and generate superstring (with extra byte 0x1 prepended to each character, and with 0x0 0x0 pair inserted between keys and values)
	// We only consider values with length > 2, because smaller values are not compressible without going into bits
	f, err := os.Open(fileName)
	if err != nil {
		return err
	}
	defer f.Close()
	r := bufio.NewReaderSize(f, etl.BufIOSize)
	buf := make([]byte, 4096)
	for l, e := binary.ReadUvarint(r); ; l, e = binary.ReadUvarint(r) {
		if e != nil {
			if errors.Is(e, io.EOF) {
				break
			}
			return e
		}
		if len(buf) < int(l) {
			buf = make([]byte, l)
		}
		if _, e = io.ReadFull(r, buf[:l]); e != nil {
			return e
		}
		if err := walker(buf[:l]); err != nil {
			return err
		}
	}
	return nil
}