github.com/ledgerwatch/erigon-lib@v1.0.0/compress/compress.go

/*
   Copyright 2021 Erigon contributors

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package compress

import (
	"bufio"
	"bytes"
	"container/heap"
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"math/bits"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/c2h5oh/datasize"
	"github.com/ledgerwatch/erigon-lib/common"
	dir2 "github.com/ledgerwatch/erigon-lib/common/dir"
	"github.com/ledgerwatch/erigon-lib/etl"
	"github.com/ledgerwatch/log/v3"
	"golang.org/x/exp/slices"
)

// Compressor is the main operating type for performing per-word compression.
// After creating a Compressor, one needs to add words to it, using the `AddWord` function.
// To add a word without compression, the `AddUncompressedWord` function is used instead.
// Once all words have been added, the `Compress` function needs to be called to perform
// the compression and eventually create the output file.
// The Compressor only tracks which words are compressed and which are not until the
// compressed file is created; after that, the user of the file needs to know when to call
// the `Next` or `NextUncompressed` function on the decompressor.
type Compressor struct {
	ctx              context.Context
	wg               *sync.WaitGroup
	superstrings     chan []byte
	uncompressedFile *DecompressedFile
	tmpDir           string // temporary directory to use for ETL when building the dictionary
	logPrefix        string
	outputFile       string // file where to output the dictionary and compressed data
	tmpOutFilePath   string // temporary file the output is written to first; renamed to outputFile on success
	suffixCollectors []*etl.Collector
	// Buffer for the current "superstring" - a transformation of the input words where each byte
	// of a word, say b, is turned into the 2 bytes 0x01 b, and the two zero bytes 0x00 0x00 are
	// inserted after each word. This allows an ordinary (single-string) suffix sorting algorithm
	// to be used instead of a generalised (multi-string) suffix sorting algorithm.
	superstring      []byte
	wordsCount       uint64
	superstringCount uint64
	superstringLen   int
	workers          int
	Ratio            CompressionRatio
	lvl              log.Lvl
	trace            bool
	logger           log.Logger
	noFsync          bool // fsync is enabled by default, but tests can manually disable
}

func NewCompressor(ctx context.Context, logPrefix, outputFile, tmpDir string, minPatternScore uint64, workers int, lvl log.Lvl, logger log.Logger) (*Compressor, error) {
	dir2.MustExist(tmpDir)
	dir, fileName := filepath.Split(outputFile)
	tmpOutFilePath := filepath.Join(dir, fileName) + ".tmp"
	// uncompressedFile is the intermediate .idt file; outputFile is the final .seg (or .dat) file.
	// tmpOutFilePath is the ".seg.tmp" (".idt.tmp") file, which is renamed to the .seg file if everything succeeds.
	// This allows the .seg file to be created atomically (the downloader will not see partially written/non-ready .seg files).
	// The ".seg.tmp" file is not created in tmpDir, because tmpDir and the snapshot dir may be mounted on different drives.
	uncompressedPath := filepath.Join(tmpDir, fileName) + ".idt"

	uncompressedFile, err := NewUncompressedFile(uncompressedPath)
	if err != nil {
		return nil, err
	}

	// Channel via which superstrings are sent to the worker goroutines that search for dictionary patterns
	superstrings := make(chan []byte, workers*2)
	wg := &sync.WaitGroup{}
	wg.Add(workers)
	suffixCollectors := make([]*etl.Collector, workers)
	for i := 0; i < workers; i++ {
		collector := etl.NewCollector(logPrefix+"_dict", tmpDir, etl.NewSortableBuffer(etl.BufferOptimalSize/2), logger)
		collector.LogLvl(lvl)

		suffixCollectors[i] = collector
		go processSuperstring(ctx, superstrings, collector, minPatternScore, wg, logger)
	}

	return &Compressor{
		uncompressedFile: uncompressedFile,
		tmpOutFilePath:   tmpOutFilePath,
		outputFile:       outputFile,
		tmpDir:           tmpDir,
		logPrefix:        logPrefix,
		workers:          workers,
		ctx:              ctx,
		superstrings:     superstrings,
		suffixCollectors: suffixCollectors,
		lvl:              lvl,
		wg:               wg,
		logger:           logger,
	}, nil
}
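
// Typical usage (a minimal sketch; the argument values here are assumptions,
// not recommendations from this package):
//
//	c, err := NewCompressor(ctx, "[snap]", "out.seg", os.TempDir(), 1024, runtime.NumCPU(), log.LvlInfo, logger)
//	if err != nil { /* handle error */ }
//	defer c.Close()
//	for _, word := range words {
//		if err := c.AddWord(word); err != nil { /* handle error */ }
//	}
//	if err := c.Compress(); err != nil { /* handle error */ }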

func (c *Compressor) Close() {
	c.uncompressedFile.Close()
	for _, collector := range c.suffixCollectors {
		collector.Close()
	}
	c.suffixCollectors = nil
}

func (c *Compressor) SetTrace(trace bool) { c.trace = trace }

func (c *Compressor) Count() int { return int(c.wordsCount) }

func (c *Compressor) AddWord(word []byte) error {
	select {
	case <-c.ctx.Done():
		return c.ctx.Err()
	default:
	}

	c.wordsCount++
	l := 2*len(word) + 2
	if c.superstringLen+l > superstringLimit {
		// The current superstring is full: submit it for pattern search (sampled) and start a new one
		if c.superstringCount%samplingFactor == 0 {
			c.superstrings <- c.superstring
		}
		c.superstringCount++
		c.superstring = make([]byte, 0, 1024*1024)
		c.superstringLen = 0
	}
	c.superstringLen += l

	if c.superstringCount%samplingFactor == 0 {
		for _, a := range word {
			c.superstring = append(c.superstring, 1, a)
		}
		c.superstring = append(c.superstring, 0, 0)
	}

	return c.uncompressedFile.Append(word)
}

func (c *Compressor) AddUncompressedWord(word []byte) error {
	select {
	case <-c.ctx.Done():
		return c.ctx.Err()
	default:
	}

	c.wordsCount++
	return c.uncompressedFile.AppendUncompressed(word)
}

func (c *Compressor) Compress() error {
	c.uncompressedFile.w.Flush()
	logEvery := time.NewTicker(20 * time.Second)
	defer logEvery.Stop()
	if len(c.superstring) > 0 {
		c.superstrings <- c.superstring
	}
	close(c.superstrings)
	c.wg.Wait()

	if c.lvl < log.LvlTrace {
		c.logger.Log(c.lvl, fmt.Sprintf("[%s] BuildDict start", c.logPrefix), "workers", c.workers)
	}
	t := time.Now()
	db, err := DictionaryBuilderFromCollectors(c.ctx, compressLogPrefix, c.tmpDir, c.suffixCollectors, c.lvl, c.logger)
	if err != nil {
		return err
	}
	if c.trace {
		_, fileName := filepath.Split(c.outputFile)
		if err := PersistDictrionary(filepath.Join(c.tmpDir, fileName)+".dictionary.txt", db); err != nil {
			return err
		}
	}
	defer os.Remove(c.tmpOutFilePath)
	if c.lvl < log.LvlTrace {
		c.logger.Log(c.lvl, fmt.Sprintf("[%s] BuildDict", c.logPrefix), "took", time.Since(t))
	}

	cf, err := os.Create(c.tmpOutFilePath)
	if err != nil {
		return err
	}
	defer cf.Close()
	t = time.Now()
	if err := reducedict(c.ctx, c.trace, c.logPrefix, c.tmpOutFilePath, cf, c.uncompressedFile, c.workers, db, c.lvl, c.logger); err != nil {
		return err
	}
	if err = c.fsync(cf); err != nil {
		return err
	}
	if err = cf.Close(); err != nil {
		return err
	}
	if err := os.Rename(c.tmpOutFilePath, c.outputFile); err != nil {
		return fmt.Errorf("renaming: %w", err)
	}

	c.Ratio, err = Ratio(c.uncompressedFile.filePath, c.outputFile)
	if err != nil {
		return fmt.Errorf("ratio: %w", err)
	}

	_, fName := filepath.Split(c.outputFile)
	if c.lvl < log.LvlTrace {
		c.logger.Log(c.lvl, fmt.Sprintf("[%s] Compress", c.logPrefix), "took", time.Since(t), "ratio", c.Ratio, "file", fName)
	}
	return nil
}

func (c *Compressor) DisableFsync() { c.noFsync = true }

// fsync - other processes/goroutines must see only "fully-complete" (valid) files, never partial writes.
// To achieve this: write to a .tmp file, then `rename` it once the file is ready.
// The machine may power off right after `rename`, which is why `fsync` must happen before `rename`.
func (c *Compressor) fsync(f *os.File) error {
	if c.noFsync {
		return nil
	}
	if err := f.Sync(); err != nil {
		c.logger.Warn("couldn't fsync", "err", err, "file", c.tmpOutFilePath)
		return err
	}
	return nil
}
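
// A minimal standalone sketch of the write-tmp/fsync/rename pattern described
// above (file names here are illustrative, not package API):
//
//	f, _ := os.Create("data.seg.tmp")
//	_, _ = f.Write(payload) // write all data to the .tmp file first
//	_ = f.Sync()            // make the bytes durable before publishing
//	_ = f.Close()
//	_ = os.Rename("data.seg.tmp", "data.seg") // atomic publish: readers never see a partial file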

// superstringLimit limits how large one "superstring" can get before it is processed.
// CompressorSequential allocates 7 bytes for each unit of superstringLimit. For example,
// a superstringLimit of 16m will result in 112MB being allocated for various arrays.
const superstringLimit = 16 * 1024 * 1024

// minPatternLen is the minimum length of a pattern we consider for inclusion into the dictionary
const minPatternLen = 5
const maxPatternLen = 128

// maxDictPatterns is the maximum number of patterns allowed in the initial (not yet reduced) dictionary.
// Large values increase memory consumption of the dictionary reduction phase.
/*
Experiments on a 74Gb uncompressed file (bsc 012500-013000-transactions.seg)
Ram - needed just to open the compressed file (Huff tables, etc...)
dec_speed - loop with `word, _ = g.Next(word[:0])`
skip_speed - loop with `g.Skip()`
| DictSize | Ram  | file_size | dec_speed | skip_speed |
| -------- | ---- | --------- | --------- | ---------- |
| 1M       | 70Mb | 35871Mb   | 4m06s     | 1m58s      |
| 512K     | 42Mb | 36496Mb   | 3m49s     | 1m51s      |
| 256K     | 21Mb | 37100Mb   | 3m44s     | 1m48s      |
| 128K     | 11Mb | 37782Mb   | 3m25s     | 1m44s      |
| 64K      | 7Mb  | 38597Mb   | 3m16s     | 1m34s      |
| 32K      | 5Mb  | 39626Mb   | 3m0s      | 1m29s      |
*/
const maxDictPatterns = 64 * 1024

// samplingFactor - skip superstrings if `superstringNumber % samplingFactor != 0`
const samplingFactor = 4

// nolint
const compressLogPrefix = "compress"

type DictionaryBuilder struct {
	lastWord      []byte
	items         []*Pattern
	limit         int
	lastWordScore uint64
}

func (db *DictionaryBuilder) Reset(limit int) {
	db.limit = limit
	db.items = db.items[:0]
}

func (db *DictionaryBuilder) Len() int { return len(db.items) }
func (db *DictionaryBuilder) Less(i, j int) bool {
	if db.items[i].score == db.items[j].score {
		return bytes.Compare(db.items[i].word, db.items[j].word) < 0
	}
	return db.items[i].score < db.items[j].score
}

func dictionaryBuilderLess(i, j *Pattern) bool {
	if i.score == j.score {
		return bytes.Compare(i.word, j.word) < 0
	}
	return i.score < j.score
}

func (db *DictionaryBuilder) Swap(i, j int) {
	db.items[i], db.items[j] = db.items[j], db.items[i]
}
func (db *DictionaryBuilder) Sort() { slices.SortFunc(db.items, dictionaryBuilderLess) }

func (db *DictionaryBuilder) Push(x interface{}) {
	db.items = append(db.items, x.(*Pattern))
}

func (db *DictionaryBuilder) Pop() interface{} {
	old := db.items
	n := len(old)
	x := old[n-1]
	old[n-1] = nil
	db.items = old[0 : n-1]
	return x
}

func (db *DictionaryBuilder) processWord(chars []byte, score uint64) {
	heap.Push(db, &Pattern{word: common.Copy(chars), score: score})
	if db.Len() > db.limit {
		// Remove the element with the smallest score
		heap.Pop(db)
	}
}
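
// A minimal sketch (illustration only, not package API) of how the bounded
// min-heap above keeps only the `limit` highest-scoring patterns:
//
//	db := &DictionaryBuilder{limit: 2}
//	db.processWord([]byte("aa"), 10)
//	db.processWord([]byte("bb"), 5)
//	db.processWord([]byte("cc"), 7) // over the limit: "bb", the lowest score, is evicted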

func (db *DictionaryBuilder) loadFunc(k, v []byte, table etl.CurrentTableReader, next etl.LoadNextFunc) error {
	score := binary.BigEndian.Uint64(v)
	if bytes.Equal(k, db.lastWord) {
		db.lastWordScore += score
	} else {
		if db.lastWord != nil {
			db.processWord(db.lastWord, db.lastWordScore)
		}
		db.lastWord = append(db.lastWord[:0], k...)
		db.lastWordScore = score
	}
	return nil
}

func (db *DictionaryBuilder) finish() {
	if db.lastWord != nil {
		db.processWord(db.lastWord, db.lastWordScore)
	}
}

func (db *DictionaryBuilder) ForEach(f func(score uint64, word []byte)) {
	for i := db.Len(); i > 0; i-- {
		f(db.items[i-1].score, db.items[i-1].word)
	}
}

func (db *DictionaryBuilder) Close() {
	db.items = nil
	db.lastWord = nil
}

// Pattern is a representation of a pattern that is searched for in the superstrings to compress them.
// Patterns are stored in a patricia tree and contain the pattern score (calculated during
// the initial dictionary building), the frequency of usage, and the code.
type Pattern struct {
	word     []byte // Pattern characters
	score    uint64 // Score assigned to the pattern during dictionary building
	uses     uint64 // How many times this pattern has been used during search and optimisation
	code     uint64 // Allocated numerical code
	codeBits int    // Number of bits in the code
	depth    int    // Depth of the pattern in the huffman tree (for encoding in the file)
}

// PatternList is a sorted list of patterns for the purpose of
// building the Huffman tree to determine efficient coding.
// Patterns with the least usage come first; we use the numerical code
// as a tie breaker to make sure the resulting Huffman code is canonical.
type PatternList []*Pattern

func (pl PatternList) Len() int { return len(pl) }
func patternListLess(i, j *Pattern) bool {
	if i.uses == j.uses {
		return bits.Reverse64(i.code) < bits.Reverse64(j.code)
	}
	return i.uses < j.uses
}

// PatternHuff is an intermediate node in a huffman tree of patterns.
// It has two children, each of which may either be another intermediate node (h0 or h1)
// or a leaf node, which is a Pattern (p0 or p1).
type PatternHuff struct {
	p0         *Pattern
	p1         *Pattern
	h0         *PatternHuff
	h1         *PatternHuff
	uses       uint64
	tieBreaker uint64
}

// AddZero appends a 0 bit (as the new lowest bit) to the code of every leaf pattern in the subtree rooted at h
func (h *PatternHuff) AddZero() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.codeBits++
	} else {
		h.h0.AddZero()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.codeBits++
	} else {
		h.h1.AddZero()
	}
}

// AddOne appends a 1 bit (as the new lowest bit) to the code of every leaf pattern in the subtree rooted at h
func (h *PatternHuff) AddOne() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.code++
		h.p0.codeBits++
	} else {
		h.h0.AddOne()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.code++
		h.p1.codeBits++
	} else {
		h.h1.AddOne()
	}
}

func (h *PatternHuff) SetDepth(depth int) {
	if h.p0 != nil {
		h.p0.depth = depth + 1
		h.p0.uses = 0
	}
	if h.p1 != nil {
		h.p1.depth = depth + 1
		h.p1.uses = 0
	}
	if h.h0 != nil {
		h.h0.SetDepth(depth + 1)
	}
	if h.h1 != nil {
		h.h1.SetDepth(depth + 1)
	}
}
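
// Sketch of how a merge step in the Huffman tree construction would use these
// methods (assumed driver code for illustration; the actual builder lives
// elsewhere in the package):
//
//	var codeHeap PatternHeap // nodes ordered by uses, then tieBreaker
//	// ... heap populated with leaf-wrapping nodes ...
//	parent := &PatternHuff{tieBreaker: tieBreaker}
//	parent.h0 = heap.Pop(&codeHeap).(*PatternHuff)
//	parent.h0.AddZero() // every leaf code under the 0-child gains a 0 bit
//	parent.uses += parent.h0.uses
//	parent.h1 = heap.Pop(&codeHeap).(*PatternHuff)
//	parent.h1.AddOne() // every leaf code under the 1-child gains a 1 bit
//	parent.uses += parent.h1.uses
//	heap.Push(&codeHeap, parent)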

// PatternHeap is a priority queue of patterns for the purpose of building
// the Huffman tree to determine efficient coding. Patterns with the least usage
// have the highest priority. We use a tie-breaker to make sure
// the resulting Huffman code is canonical.
type PatternHeap []*PatternHuff

func (ph PatternHeap) Len() int {
	return len(ph)
}

func (ph PatternHeap) Less(i, j int) bool {
	if ph[i].uses == ph[j].uses {
		return ph[i].tieBreaker < ph[j].tieBreaker
	}
	return ph[i].uses < ph[j].uses
}

func (ph *PatternHeap) Swap(i, j int) {
	(*ph)[i], (*ph)[j] = (*ph)[j], (*ph)[i]
}

func (ph *PatternHeap) Push(x interface{}) {
	*ph = append(*ph, x.(*PatternHuff))
}

func (ph *PatternHeap) Pop() interface{} {
	old := *ph
	n := len(old)
	x := old[n-1]
	old[n-1] = nil
	*ph = old[0 : n-1]
	return x
}

type Position struct {
	uses     uint64
	pos      uint64
	code     uint64
	codeBits int
	depth    int // Depth of the position in the huffman tree (for encoding in the file)
}

type PositionHuff struct {
	p0         *Position
	p1         *Position
	h0         *PositionHuff
	h1         *PositionHuff
	uses       uint64
	tieBreaker uint64
}

func (h *PositionHuff) AddZero() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.codeBits++
	} else {
		h.h0.AddZero()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.codeBits++
	} else {
		h.h1.AddZero()
	}
}

func (h *PositionHuff) AddOne() {
	if h.p0 != nil {
		h.p0.code <<= 1
		h.p0.code++
		h.p0.codeBits++
	} else {
		h.h0.AddOne()
	}
	if h.p1 != nil {
		h.p1.code <<= 1
		h.p1.code++
		h.p1.codeBits++
	} else {
		h.h1.AddOne()
	}
}

func (h *PositionHuff) SetDepth(depth int) {
	if h.p0 != nil {
		h.p0.depth = depth + 1
		h.p0.uses = 0
	}
	if h.p1 != nil {
		h.p1.depth = depth + 1
		h.p1.uses = 0
	}
	if h.h0 != nil {
		h.h0.SetDepth(depth + 1)
	}
	if h.h1 != nil {
		h.h1.SetDepth(depth + 1)
	}
}

type PositionList []*Position

func (pl PositionList) Len() int { return len(pl) }

func positionListLess(i, j *Position) bool {
	if i.uses == j.uses {
		return bits.Reverse64(i.code) < bits.Reverse64(j.code)
	}
	return i.uses < j.uses
}

type PositionHeap []*PositionHuff

func (ph PositionHeap) Len() int {
	return len(ph)
}

func (ph PositionHeap) Less(i, j int) bool {
	if ph[i].uses == ph[j].uses {
		return ph[i].tieBreaker < ph[j].tieBreaker
	}
	return ph[i].uses < ph[j].uses
}

func (ph *PositionHeap) Swap(i, j int) {
	(*ph)[i], (*ph)[j] = (*ph)[j], (*ph)[i]
}

func (ph *PositionHeap) Push(x interface{}) {
	*ph = append(*ph, x.(*PositionHuff))
}

func (ph *PositionHeap) Pop() interface{} {
	old := *ph
	n := len(old)
	x := old[n-1]
	old[n-1] = nil
	*ph = old[0 : n-1]
	return x
}

// HuffmanCoder packs variable-length codes into bytes, least significant bit first
type HuffmanCoder struct {
	w          *bufio.Writer
	outputBits int
	outputByte byte
}

func (hf *HuffmanCoder) encode(code uint64, codeBits int) error {
	for codeBits > 0 {
		var bitsUsed int
		if hf.outputBits+codeBits > 8 {
			bitsUsed = 8 - hf.outputBits
		} else {
			bitsUsed = codeBits
		}
		mask := (uint64(1) << bitsUsed) - 1
		hf.outputByte |= byte((code & mask) << hf.outputBits)
		code >>= bitsUsed
		codeBits -= bitsUsed
		hf.outputBits += bitsUsed
		if hf.outputBits == 8 {
			if e := hf.w.WriteByte(hf.outputByte); e != nil {
				return e
			}
			hf.outputBits = 0
			hf.outputByte = 0
		}
	}
	return nil
}

// flush writes out any remaining partial byte, padding the unused high bits with zeros
func (hf *HuffmanCoder) flush() error {
	if hf.outputBits > 0 {
		if e := hf.w.WriteByte(hf.outputByte); e != nil {
			return e
		}
		hf.outputBits = 0
		hf.outputByte = 0
	}
	return nil
}
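
// Worked example (illustration only): encoding the 3-bit code 0b101 twice
// packs the bits least significant first into a single byte:
//
//	var buf bytes.Buffer
//	hc := &HuffmanCoder{w: bufio.NewWriter(&buf)}
//	_ = hc.encode(0b101, 3) // outputByte = 0b00000101, outputBits = 3
//	_ = hc.encode(0b101, 3) // outputByte = 0b00101101, outputBits = 6
//	_ = hc.flush()          // writes 0x2d to the bufio writer; the two unused high bits are zero padding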

// DynamicCell represents the result of dynamic programming for a certain starting position
type DynamicCell struct {
	optimStart  int
	coverStart  int
	compression int
	score       uint64
	patternIdx  int // offset of the last element in the pattern slice
}

// Ring is a circular buffer of DynamicCells. Its capacity is always a power
// of two (it starts at 16 and doubles), which allows Get and Truncate to use
// masking instead of a modulo operation.
type Ring struct {
	cells             []DynamicCell
	head, tail, count int
}

func NewRing() *Ring {
	return &Ring{
		cells: make([]DynamicCell, 16),
		head:  0,
		tail:  0,
		count: 0,
	}
}

func (r *Ring) Reset() {
	r.count = 0
	r.head = 0
	r.tail = 0
}

func (r *Ring) ensureSize() {
	if r.count < len(r.cells) {
		return
	}
	newcells := make([]DynamicCell, r.count*2)
	if r.tail > r.head {
		copy(newcells, r.cells[r.head:r.tail])
	} else {
		n := copy(newcells, r.cells[r.head:])
		copy(newcells[n:], r.cells[:r.tail])
	}
	r.head = 0
	r.tail = r.count
	r.cells = newcells
}

func (r *Ring) PushFront() *DynamicCell {
	r.ensureSize()
	if r.head == 0 {
		r.head = len(r.cells)
	}
	r.head--
	r.count++
	return &r.cells[r.head]
}

func (r *Ring) PushBack() *DynamicCell {
	r.ensureSize()
	if r.tail == len(r.cells) {
		r.tail = 0
	}
	result := &r.cells[r.tail]
	r.tail++
	r.count++
	return result
}

func (r Ring) Len() int {
	return r.count
}

func (r *Ring) Get(i int) *DynamicCell {
	if i < 0 || i >= r.count {
		return nil
	}
	return &r.cells[(r.head+i)&(len(r.cells)-1)]
}

// Truncate removes all items starting from i
func (r *Ring) Truncate(i int) {
	r.count = i
	r.tail = (r.head + i) & (len(r.cells) - 1)
}
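
// A minimal usage sketch (illustration only):
//
//	r := NewRing()
//	c := r.PushBack() // allocate a cell at the back, in place
//	c.score = 42
//	front := r.Get(0) // index 0 addresses the current head
//	_ = front
//	r.Truncate(1) // keep only the first cell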

type DictAggregator struct {
	collector     *etl.Collector
	dist          map[int]int
	lastWord      []byte
	lastWordScore uint64
}

func (da *DictAggregator) processWord(word []byte, score uint64) error {
	var scoreBuf [8]byte
	binary.BigEndian.PutUint64(scoreBuf[:], score)
	return da.collector.Collect(word, scoreBuf[:])
}

func (da *DictAggregator) Load(loadFunc etl.LoadFunc, args etl.TransformArgs) error {
	defer da.collector.Close()
	return da.collector.Load(nil, "", loadFunc, args)
}

func (da *DictAggregator) aggLoadFunc(k, v []byte, table etl.CurrentTableReader, next etl.LoadNextFunc) error {
	if _, ok := da.dist[len(k)]; !ok {
		da.dist[len(k)] = 0
	}
	da.dist[len(k)]++

	score := binary.BigEndian.Uint64(v)
	if bytes.Equal(k, da.lastWord) {
		da.lastWordScore += score
	} else {
		if da.lastWord != nil {
			if err := da.processWord(da.lastWord, da.lastWordScore); err != nil {
				return err
			}
		}
		da.lastWord = append(da.lastWord[:0], k...)
		da.lastWordScore = score
	}
	return nil
}

func (da *DictAggregator) finish() error {
	if da.lastWord != nil {
		return da.processWord(da.lastWord, da.lastWordScore)
	}
	return nil
}
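
// Illustration of the merge performed above: scores for equal words arrive
// adjacent from the sorted ETL collector, so aggLoadFunc folds, e.g.,
//
//	("abc", 3), ("abc", 5), ("abd", 2)
//
// into ("abc", 8) and ("abd", 2), and processWord forwards each total
// (as a big-endian uint64) to the next collector.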

type CompressionRatio float64

func (r CompressionRatio) String() string { return fmt.Sprintf("%.2f", r) }

func Ratio(f1, f2 string) (CompressionRatio, error) {
	s1, err := os.Stat(f1)
	if err != nil {
		return 0, err
	}
	s2, err := os.Stat(f2)
	if err != nil {
		return 0, err
	}
	return CompressionRatio(float64(s1.Size()) / float64(s2.Size())), nil
}
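
// Worked example: Ratio("data.idt", "data.seg") with a 100MB uncompressed
// file and a 25MB compressed file returns 100/25 = 4.00 (the file names here
// are hypothetical).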

// DecompressedFile - a simple .dat file format for storing temporary data
type DecompressedFile struct {
	f        *os.File
	w        *bufio.Writer
	filePath string
	buf      []byte
	count    uint64
}

func NewUncompressedFile(filePath string) (*DecompressedFile, error) {
	f, err := os.Create(filePath)
	if err != nil {
		return nil, err
	}
	w := bufio.NewWriterSize(f, 2*etl.BufIOSize)
	return &DecompressedFile{filePath: filePath, f: f, w: w, buf: make([]byte, 128)}, nil
}

func (f *DecompressedFile) Close() {
	f.w.Flush()
	f.f.Close()
	os.Remove(f.filePath)
}

func (f *DecompressedFile) Append(v []byte) error {
	f.count++
	// For compressed words, the length prefix is shifted to make the lowest bit zero
	n := binary.PutUvarint(f.buf, 2*uint64(len(v)))
	if _, e := f.w.Write(f.buf[:n]); e != nil {
		return e
	}
	if len(v) > 0 {
		if _, e := f.w.Write(v); e != nil {
			return e
		}
	}
	return nil
}

func (f *DecompressedFile) AppendUncompressed(v []byte) error {
	f.count++
	// For uncompressed words, the length prefix is shifted to make the lowest bit one
	n := binary.PutUvarint(f.buf, 2*uint64(len(v))+1)
	if _, e := f.w.Write(f.buf[:n]); e != nil {
		return e
	}
	if len(v) > 0 {
		if _, e := f.w.Write(v); e != nil {
			return e
		}
	}
	return nil
}
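
// Worked example of the length prefix: a 3-byte word is written with the
// uvarint prefix 2*3 = 6 (low bit 0, "compressed") by Append, and with
// 2*3+1 = 7 (low bit 1, "uncompressed") by AppendUncompressed; ForEach reads
// the low bit back as the flag and shifts right to recover the length.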

// ForEach reads the words back from the file and passes each one to the walker,
// together with a flag indicating whether the word is to be compressed. The walker
// is what generates the superstring (with the extra byte 0x01 prepended to each
// character, and the 0x00 0x00 pair inserted after each word). We only consider
// values with length > 2, because smaller values are not compressible without going into bits.
func (f *DecompressedFile) ForEach(walker func(v []byte, compressed bool) error) error {
	_, err := f.f.Seek(0, 0)
	if err != nil {
		return err
	}
	r := bufio.NewReaderSize(f.f, int(8*datasize.MB))
	buf := make([]byte, 16*1024)
	l, e := binary.ReadUvarint(r)
	for ; e == nil; l, e = binary.ReadUvarint(r) {
		// extract the lowest bit of the length prefix as the "uncompressed" flag and shift to obtain the correct length
		compressed := (l & 1) == 0
		l >>= 1
		if len(buf) < int(l) {
			buf = make([]byte, l)
		}
		if _, e = io.ReadFull(r, buf[:l]); e != nil {
			return e
		}
		if err := walker(buf[:l], compressed); err != nil {
			return err
		}
	}
	if e != nil && !errors.Is(e, io.EOF) {
		return e
	}
	return nil
}