github.com/ledgerwatch/erigon-lib@v1.0.0/state/domain.go

     1  /*
     2     Copyright 2022 Erigon contributors
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package state
    18  
    19  import (
    20  	"bytes"
    21  	"container/heap"
    22  	"context"
    23  	"encoding/binary"
    24  	"fmt"
    25  	"math"
    26  	"os"
    27  	"path/filepath"
    28  	"regexp"
    29  	"strconv"
    30  	"strings"
    31  	"sync/atomic"
    32  	"time"
    33  
    34  	"github.com/RoaringBitmap/roaring/roaring64"
    35  	"github.com/ledgerwatch/erigon-lib/common/background"
    36  	btree2 "github.com/tidwall/btree"
    37  	"golang.org/x/sync/errgroup"
    38  
    39  	"github.com/ledgerwatch/log/v3"
    40  
    41  	"github.com/ledgerwatch/erigon-lib/common"
    42  	"github.com/ledgerwatch/erigon-lib/common/dir"
    43  	"github.com/ledgerwatch/erigon-lib/compress"
    44  	"github.com/ledgerwatch/erigon-lib/kv"
    45  	"github.com/ledgerwatch/erigon-lib/kv/bitmapdb"
    46  	"github.com/ledgerwatch/erigon-lib/recsplit"
    47  )
    48  
     49  // filesItem corresponds to a pair of files (.dat and .idx)
    50  type filesItem struct {
    51  	decompressor *compress.Decompressor
    52  	index        *recsplit.Index
    53  	bindex       *BtIndex
    54  	startTxNum   uint64
    55  	endTxNum     uint64
    56  
    57  	// Frozen: file of size StepsInBiggestFile. Completely immutable.
     58  	// Cold: file of size < StepsInBiggestFile. Immutable, but can be closed/removed after merge into a bigger file.
     59  	// Hot: stored in DB. Provides snapshot isolation via copy-on-write.
    60  	frozen   bool         // immutable, don't need atomic
    61  	refcount atomic.Int32 // only for `frozen=false`
    62  
     63  	// a file can be deleted in 2 cases: 1. when `refcount == 0 && canDelete == true` 2. on app startup when `file.isSubsetOfFrozenFile()`
     64  	// other processes (which also read these files) may apply the same logic
    65  	canDelete atomic.Bool
    66  }
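// Illustrative sketch (not part of the original API): how a reader is expected
// to release a non-frozen file under the rules above. The last reader of a file
// that was marked for deletion physically removes it; DomainContext.Close below
// implements exactly this pattern.
//
//	func release(i *filesItem) { // hypothetical helper
//		if i.frozen {
//			return // frozen files are never refcounted or deleted
//		}
//		if i.refcount.Add(-1) == 0 && i.canDelete.Load() {
//			i.closeFilesAndRemove()
//		}
//	}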
    67  
    68  func newFilesItem(startTxNum, endTxNum uint64, stepSize uint64) *filesItem {
    69  	startStep := startTxNum / stepSize
    70  	endStep := endTxNum / stepSize
    71  	frozen := endStep-startStep == StepsInBiggestFile
    72  	return &filesItem{startTxNum: startTxNum, endTxNum: endTxNum, frozen: frozen}
    73  }
    74  
    75  func (i *filesItem) isSubsetOf(j *filesItem) bool {
    76  	return (j.startTxNum <= i.startTxNum && i.endTxNum <= j.endTxNum) && (j.startTxNum != i.startTxNum || i.endTxNum != j.endTxNum)
    77  }
    78  
    79  func filesItemLess(i, j *filesItem) bool {
    80  	if i.endTxNum == j.endTxNum {
    81  		return i.startTxNum > j.startTxNum
    82  	}
    83  	return i.endTxNum < j.endTxNum
    84  }
    85  func (i *filesItem) closeFilesAndRemove() {
    86  	if i.decompressor != nil {
    87  		i.decompressor.Close()
     88  		// paranoid-mode on: don't delete frozen files
    89  		if !i.frozen {
    90  			if err := os.Remove(i.decompressor.FilePath()); err != nil {
    91  				log.Trace("close", "err", err, "file", i.decompressor.FileName())
    92  			}
    93  		}
    94  		i.decompressor = nil
    95  	}
    96  	if i.index != nil {
    97  		i.index.Close()
     98  		// paranoid-mode on: don't delete frozen files
    99  		if !i.frozen {
   100  			if err := os.Remove(i.index.FilePath()); err != nil {
   101  				log.Trace("close", "err", err, "file", i.index.FileName())
   102  			}
   103  		}
   104  		i.index = nil
   105  	}
    106  	if i.bindex != nil {
    107  		i.bindex.Close()
    108  		// paranoid-mode on: don't delete frozen files
    109  		if !i.frozen {
    110  			if err := os.Remove(i.bindex.FilePath()); err != nil {
    111  				log.Trace("close", "err", err, "file", i.bindex.FileName())
    112  			}
    113  		}
    114  		i.bindex = nil
    115  	}
    116  }
   114  
   115  type DomainStats struct {
   116  	MergesCount          uint64
   117  	LastCollationTook    time.Duration
   118  	LastPruneTook        time.Duration
   119  	LastPruneHistTook    time.Duration
   120  	LastFileBuildingTook time.Duration
   121  	LastCollationSize    uint64
   122  	LastPruneSize        uint64
   123  
   124  	HistoryQueries *atomic.Uint64
   125  	TotalQueries   *atomic.Uint64
   126  	EfSearchTime   time.Duration
   127  	DataSize       uint64
   128  	IndexSize      uint64
   129  	FilesCount     uint64
   130  }
   131  
   132  func (ds *DomainStats) Accumulate(other DomainStats) {
   133  	ds.HistoryQueries.Add(other.HistoryQueries.Load())
   134  	ds.TotalQueries.Add(other.TotalQueries.Load())
   135  	ds.EfSearchTime += other.EfSearchTime
   136  	ds.IndexSize += other.IndexSize
   137  	ds.DataSize += other.DataSize
   138  	ds.FilesCount += other.FilesCount
   139  }
   140  
    141  // Domain is a part of the state (examples are Accounts, Storage, Code).
    142  // Domain should not have any goroutines or locks.
   143  type Domain struct {
   144  	/*
   145  	   not large:
   146  	    	keys: key -> ^step
   147  	    	vals: key -> ^step+value (DupSort)
   148  	   large:
   149  	    	keys: key -> ^step
   150  	   	    vals: key + ^step -> value
   151  	*/
   152  
   153  	*History
    154  	files *btree2.BTreeG[*filesItem] // thread-safe, but may need 1 RWLock for all trees in AggregatorV3
    155  	// roFiles is derived from the `files` field, but without garbage (canDelete=true, overlaps, etc.)
    156  	// MakeContext() uses this field in a zero-copy way
   157  	roFiles     atomic.Pointer[[]ctxItem]
   158  	defaultDc   *DomainContext
    159  	keysTable   string // key -> invertedStep; invertedStep = ^(txNum / aggregationStep); needs to be a DupSort table
   160  	valsTable   string // key + invertedStep -> values
   161  	stats       DomainStats
   162  	mergesCount uint64
   163  
   164  	garbageFiles []*filesItem // files that exist on disk, but ignored on opening folder - because they are garbage
   165  	logger       log.Logger
   166  }
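// A minimal sketch (hypothetical helper, not in the original file) of the ^step
// encoding documented above: the step number is bitwise-inverted and stored
// big-endian, so the most recent step sorts first under a DupSort cursor and
// SeekBothRange can find the latest value no newer than a given txNum.
//
//	func invertedStep(txNum, aggregationStep uint64) [8]byte {
//		var buf [8]byte
//		binary.BigEndian.PutUint64(buf[:], ^(txNum / aggregationStep))
//		return buf
//	}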
   167  
   168  func NewDomain(dir, tmpdir string, aggregationStep uint64,
   169  	filenameBase, keysTable, valsTable, indexKeysTable, historyValsTable, indexTable string,
   170  	compressVals, largeValues bool, logger log.Logger) (*Domain, error) {
   171  	d := &Domain{
   172  		keysTable: keysTable,
   173  		valsTable: valsTable,
   174  		files:     btree2.NewBTreeGOptions[*filesItem](filesItemLess, btree2.Options{Degree: 128, NoLocks: false}),
   175  		stats:     DomainStats{HistoryQueries: &atomic.Uint64{}, TotalQueries: &atomic.Uint64{}},
   176  		logger:    logger,
   177  	}
   178  	d.roFiles.Store(&[]ctxItem{})
   179  
   180  	var err error
   181  	if d.History, err = NewHistory(dir, tmpdir, aggregationStep, filenameBase, indexKeysTable, indexTable, historyValsTable, compressVals, []string{"kv"}, largeValues, logger); err != nil {
   182  		return nil, err
   183  	}
   184  
   185  	return d, nil
   186  }
   187  
    188  // LastStepInDB - returns the latest available step in the DB (with at least 1 value in that step)
   189  func (d *Domain) LastStepInDB(tx kv.Tx) (lstInDb uint64) {
   190  	lst, _ := kv.FirstKey(tx, d.valsTable)
   191  	if len(lst) > 0 {
   192  		lstInDb = ^binary.BigEndian.Uint64(lst[len(lst)-8:])
   193  	}
   194  	return lstInDb
   195  }
   196  
   197  func (d *Domain) StartWrites() {
   198  	d.defaultDc = d.MakeContext()
   199  	d.History.StartWrites()
   200  }
   201  
   202  func (d *Domain) FinishWrites() {
   203  	d.defaultDc.Close()
   204  	d.History.FinishWrites()
   205  }
   206  
    207  // OpenList - main method to open a list of files.
    208  // It's ok if some files were opened earlier.
    209  // If a file is already open: noop.
    210  // If a file is already open but not in the provided list: close it and remove it from the `files` field.
   211  func (d *Domain) OpenList(fNames []string) error {
   212  	if err := d.History.OpenList(fNames); err != nil {
   213  		return err
   214  	}
   215  	return d.openList(fNames)
   216  }
   217  
   218  func (d *Domain) openList(fNames []string) error {
   219  	d.closeWhatNotInList(fNames)
   220  	d.garbageFiles = d.scanStateFiles(fNames)
   221  	if err := d.openFiles(); err != nil {
    222  		return fmt.Errorf("Domain.openList: %s, %w", d.filenameBase, err)
   223  	}
   224  	return nil
   225  }
   226  
   227  func (d *Domain) OpenFolder() error {
   228  	files, err := d.fileNamesOnDisk()
   229  	if err != nil {
   230  		return err
   231  	}
   232  	return d.OpenList(files)
   233  }
   234  
   235  func (d *Domain) GetAndResetStats() DomainStats {
   236  	r := d.stats
   237  	r.DataSize, r.IndexSize, r.FilesCount = d.collectFilesStats()
   238  
   239  	d.stats = DomainStats{}
   240  	return r
   241  }
   242  
   243  func (d *Domain) scanStateFiles(fileNames []string) (garbageFiles []*filesItem) {
   244  	re := regexp.MustCompile("^" + d.filenameBase + ".([0-9]+)-([0-9]+).kv$")
   245  	var err error
   246  Loop:
   247  	for _, name := range fileNames {
   248  		subs := re.FindStringSubmatch(name)
   249  		if len(subs) != 3 {
   250  			if len(subs) != 0 {
    251  				d.logger.Warn("File ignored by domain scan, unexpected number of submatches", "name", name, "submatches", len(subs))
   252  			}
   253  			continue
   254  		}
   255  		var startStep, endStep uint64
   256  		if startStep, err = strconv.ParseUint(subs[1], 10, 64); err != nil {
    257  			d.logger.Warn("File ignored by domain scan, parsing startStep", "error", err, "name", name)
   258  			continue
   259  		}
   260  		if endStep, err = strconv.ParseUint(subs[2], 10, 64); err != nil {
    261  			d.logger.Warn("File ignored by domain scan, parsing endStep", "error", err, "name", name)
   262  			continue
   263  		}
   264  		if startStep > endStep {
    265  			d.logger.Warn("File ignored by domain scan, startStep > endStep", "name", name)
   266  			continue
   267  		}
   268  
   269  		startTxNum, endTxNum := startStep*d.aggregationStep, endStep*d.aggregationStep
   270  		var newFile = newFilesItem(startTxNum, endTxNum, d.aggregationStep)
   271  
   272  		for _, ext := range d.integrityFileExtensions {
   273  			requiredFile := fmt.Sprintf("%s.%d-%d.%s", d.filenameBase, startStep, endStep, ext)
   274  			if !dir.FileExist(filepath.Join(d.dir, requiredFile)) {
    275  				d.logger.Debug(fmt.Sprintf("[snapshots] skip %s because %s doesn't exist", name, requiredFile))
   276  				garbageFiles = append(garbageFiles, newFile)
   277  				continue Loop
   278  			}
   279  		}
   280  
   281  		if _, has := d.files.Get(newFile); has {
   282  			continue
   283  		}
   284  
   285  		addNewFile := true
   286  		var subSets []*filesItem
   287  		d.files.Walk(func(items []*filesItem) bool {
   288  			for _, item := range items {
   289  				if item.isSubsetOf(newFile) {
   290  					subSets = append(subSets, item)
   291  					continue
   292  				}
   293  
   294  				if newFile.isSubsetOf(item) {
   295  					if item.frozen {
   296  						addNewFile = false
   297  						garbageFiles = append(garbageFiles, newFile)
   298  					}
   299  					continue
   300  				}
   301  			}
   302  			return true
   303  		})
   304  		if addNewFile {
   305  			d.files.Set(newFile)
   306  		}
   307  	}
   308  	return garbageFiles
   309  }
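// Example of the naming scheme scanStateFiles expects (illustrative values):
// with filenameBase "accounts" and aggregationStep 16, the file
// "accounts.0-2.kv" has startStep=0, endStep=2 and covers txNums [0; 32).
// A file whose txNum range is a strict subset of an existing frozen file is
// treated as garbage and returned to the caller instead of being opened.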
   310  
   311  func (d *Domain) openFiles() (err error) {
   312  	var totalKeys uint64
   313  
   314  	invalidFileItems := make([]*filesItem, 0)
   315  	d.files.Walk(func(items []*filesItem) bool {
   316  		for _, item := range items {
   317  			if item.decompressor != nil {
   318  				continue
   319  			}
   320  			fromStep, toStep := item.startTxNum/d.aggregationStep, item.endTxNum/d.aggregationStep
   321  			datPath := filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.kv", d.filenameBase, fromStep, toStep))
   322  			if !dir.FileExist(datPath) {
   323  				invalidFileItems = append(invalidFileItems, item)
   324  				continue
   325  			}
   326  			if item.decompressor, err = compress.NewDecompressor(datPath); err != nil {
   327  				return false
   328  			}
   329  
   330  			if item.index != nil {
   331  				continue
   332  			}
   333  			idxPath := filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.kvi", d.filenameBase, fromStep, toStep))
   334  			if dir.FileExist(idxPath) {
   335  				if item.index, err = recsplit.OpenIndex(idxPath); err != nil {
    336  				d.logger.Debug("Domain.openFiles", "err", err, "file", idxPath)
   337  					return false
   338  				}
   339  				totalKeys += item.index.KeyCount()
   340  			}
   341  			if item.bindex == nil {
   342  				bidxPath := filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.bt", d.filenameBase, fromStep, toStep))
   343  				if item.bindex, err = OpenBtreeIndexWithDecompressor(bidxPath, 2048, item.decompressor); err != nil {
    344  				d.logger.Debug("Domain.openFiles", "err", err, "file", bidxPath)
   345  					return false
   346  				}
   347  				//totalKeys += item.bindex.KeyCount()
   348  			}
   349  		}
   350  		return true
   351  	})
   352  	if err != nil {
   353  		return err
   354  	}
   355  	for _, item := range invalidFileItems {
   356  		d.files.Delete(item)
   357  	}
   358  
   359  	d.reCalcRoFiles()
   360  	return nil
   361  }
   362  
   363  func (d *Domain) closeWhatNotInList(fNames []string) {
   364  	var toDelete []*filesItem
   365  	d.files.Walk(func(items []*filesItem) bool {
   366  	Loop1:
   367  		for _, item := range items {
   368  			for _, protectName := range fNames {
   369  				if item.decompressor != nil && item.decompressor.FileName() == protectName {
   370  					continue Loop1
   371  				}
   372  			}
   373  			toDelete = append(toDelete, item)
   374  		}
   375  		return true
   376  	})
   377  	for _, item := range toDelete {
   378  		if item.decompressor != nil {
   379  			item.decompressor.Close()
   380  			item.decompressor = nil
   381  		}
   382  		if item.index != nil {
   383  			item.index.Close()
   384  			item.index = nil
   385  		}
   386  		if item.bindex != nil {
   387  			item.bindex.Close()
   388  			item.bindex = nil
   389  		}
   390  		d.files.Delete(item)
   391  	}
   392  }
   393  
   394  func (d *Domain) reCalcRoFiles() {
   395  	roFiles := ctxFiles(d.files)
   396  	d.roFiles.Store(&roFiles)
   397  }
   398  
   399  func (d *Domain) Close() {
   400  	d.History.Close()
   401  	d.closeWhatNotInList([]string{})
   402  	d.reCalcRoFiles()
   403  }
   404  
   405  func (dc *DomainContext) get(key []byte, fromTxNum uint64, roTx kv.Tx) ([]byte, bool, error) {
   406  	//var invertedStep [8]byte
   407  	dc.d.stats.TotalQueries.Add(1)
   408  
   409  	invertedStep := dc.numBuf
   410  	binary.BigEndian.PutUint64(invertedStep[:], ^(fromTxNum / dc.d.aggregationStep))
   411  	keyCursor, err := roTx.CursorDupSort(dc.d.keysTable)
   412  	if err != nil {
   413  		return nil, false, err
   414  	}
   415  	defer keyCursor.Close()
   416  	foundInvStep, err := keyCursor.SeekBothRange(key, invertedStep[:])
   417  	if err != nil {
   418  		return nil, false, err
   419  	}
   420  	if len(foundInvStep) == 0 {
   421  		dc.d.stats.HistoryQueries.Add(1)
   422  		return dc.readFromFiles(key, fromTxNum)
   423  	}
   424  	//keySuffix := make([]byte, len(key)+8)
   425  	copy(dc.keyBuf[:], key)
   426  	copy(dc.keyBuf[len(key):], foundInvStep)
   427  	v, err := roTx.GetOne(dc.d.valsTable, dc.keyBuf[:len(key)+8])
   428  	if err != nil {
   429  		return nil, false, err
   430  	}
   431  	return v, true, nil
   432  }
   433  
   434  func (dc *DomainContext) Get(key1, key2 []byte, roTx kv.Tx) ([]byte, error) {
   435  	//key := make([]byte, len(key1)+len(key2))
   436  	copy(dc.keyBuf[:], key1)
   437  	copy(dc.keyBuf[len(key1):], key2)
   438  	// keys larger than 52 bytes will panic
   439  	v, _, err := dc.get(dc.keyBuf[:len(key1)+len(key2)], dc.d.txNum, roTx)
   440  	return v, err
   441  }
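// Illustrative read-path sketch (key values are hypothetical; tx is any kv.Tx):
//
//	dc := d.MakeContext()
//	defer dc.Close()
//	v, err := dc.Get(addr, storageKey, tx) // len(addr)+len(storageKey) must not exceed 52
//
// get first probes keysTable/valsTable for the newest step not newer than
// fromTxNum's step; only if the DB has no such entry does it fall back to
// readFromFiles.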
   442  
   443  func (d *Domain) update(key, original []byte) error {
   444  	var invertedStep [8]byte
   445  	binary.BigEndian.PutUint64(invertedStep[:], ^(d.txNum / d.aggregationStep))
   446  	if err := d.tx.Put(d.keysTable, key, invertedStep[:]); err != nil {
   447  		return err
   448  	}
   449  	return nil
   450  }
   451  
   452  func (d *Domain) Put(key1, key2, val []byte) error {
   453  	key := make([]byte, len(key1)+len(key2))
   454  	copy(key, key1)
   455  	copy(key[len(key1):], key2)
   456  	original, _, err := d.defaultDc.get(key, d.txNum, d.tx)
   457  	if err != nil {
   458  		return err
   459  	}
   460  	if bytes.Equal(original, val) {
   461  		return nil
   462  	}
    463  	// This call to update needs to happen before d.tx.Put() below, because otherwise the content of the `original` slice is invalidated
   464  	if err = d.History.AddPrevValue(key1, key2, original); err != nil {
   465  		return err
   466  	}
   467  	if err = d.update(key, original); err != nil {
   468  		return err
   469  	}
   470  	invertedStep := ^(d.txNum / d.aggregationStep)
   471  	keySuffix := make([]byte, len(key)+8)
   472  	copy(keySuffix, key)
   473  	binary.BigEndian.PutUint64(keySuffix[len(key):], invertedStep)
   474  	if err = d.tx.Put(d.valsTable, keySuffix, val); err != nil {
   475  		return err
   476  	}
   477  	return nil
   478  }
   479  
   480  func (d *Domain) Delete(key1, key2 []byte) error {
   481  	key := make([]byte, len(key1)+len(key2))
   482  	copy(key, key1)
   483  	copy(key[len(key1):], key2)
   484  	original, found, err := d.defaultDc.get(key, d.txNum, d.tx)
   485  	if err != nil {
   486  		return err
   487  	}
   488  	if !found {
   489  		return nil
   490  	}
    491  	// This call to update needs to happen before d.tx.Delete() below, because otherwise the content of the `original` slice is invalidated
   492  	if err = d.History.AddPrevValue(key1, key2, original); err != nil {
   493  		return err
   494  	}
   495  	if err = d.update(key, original); err != nil {
   496  		return err
   497  	}
   498  	invertedStep := ^(d.txNum / d.aggregationStep)
   499  	keySuffix := make([]byte, len(key)+8)
   500  	copy(keySuffix, key)
   501  	binary.BigEndian.PutUint64(keySuffix[len(key):], invertedStep)
   502  	if err = d.tx.Delete(d.valsTable, keySuffix); err != nil {
   503  		return err
   504  	}
   505  	return nil
   506  }
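// Illustrative write-path sketch (assumption: SetTx/SetTxNum come from the
// embedded History/InvertedIndex; addr/loc/newVal are hypothetical):
//
//	d.SetTx(rwTx)
//	d.SetTxNum(txNum)
//	d.StartWrites()
//	defer d.FinishWrites()
//	if err := d.Put(addr, loc, newVal); err != nil { /* handle */ }
//	if err := d.Delete(addr, loc); err != nil { /* handle */ }
//
// Both Put and Delete record the previous value into History first, so the
// historical view of the key stays reconstructable.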
   507  
   508  type CursorType uint8
   509  
   510  const (
   511  	FILE_CURSOR CursorType = iota
   512  	DB_CURSOR
   513  )
   514  
    515  // CursorItem is the item in the priority queue used to do merge iteration
    516  // over storage of a given account
   517  type CursorItem struct {
   518  	c        kv.CursorDupSort
   519  	dg       *compress.Getter
   520  	dg2      *compress.Getter
   521  	key      []byte
   522  	val      []byte
   523  	endTxNum uint64
    524  	t        CursorType // Whether this item represents a state file, a DB record, or a tree
   525  	reverse  bool
   526  }
   527  
   528  type CursorHeap []*CursorItem
   529  
   530  func (ch CursorHeap) Len() int {
   531  	return len(ch)
   532  }
   533  
   534  func (ch CursorHeap) Less(i, j int) bool {
   535  	cmp := bytes.Compare(ch[i].key, ch[j].key)
   536  	if cmp == 0 {
   537  		// when keys match, the items with later blocks are preferred
   538  		if ch[i].reverse {
   539  			return ch[i].endTxNum > ch[j].endTxNum
   540  		}
   541  		return ch[i].endTxNum < ch[j].endTxNum
   542  	}
   543  	return cmp < 0
   544  }
   545  
   546  func (ch *CursorHeap) Swap(i, j int) {
   547  	(*ch)[i], (*ch)[j] = (*ch)[j], (*ch)[i]
   548  }
   549  
   550  func (ch *CursorHeap) Push(x interface{}) {
   551  	*ch = append(*ch, x.(*CursorItem))
   552  }
   553  
   554  func (ch *CursorHeap) Pop() interface{} {
   555  	old := *ch
   556  	n := len(old)
   557  	x := old[n-1]
   558  	old[n-1] = nil
   559  	*ch = old[0 : n-1]
   560  	return x
   561  }
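// CursorHeap satisfies heap.Interface, so merge iteration is driven by the
// standard container/heap package (IteratePrefix below is the real use):
//
//	heap.Init(&cp)
//	heap.Push(&cp, &CursorItem{ /* ... */ })
//	top := cp[0] // smallest key; on equal keys the later endTxNum wins when reverse=true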
   562  
    563  // ctxItem is a read-only wrapper around a filesItem, used inside contexts
   564  type ctxItem struct {
   565  	getter     *compress.Getter
   566  	reader     *recsplit.IndexReader
   567  	startTxNum uint64
   568  	endTxNum   uint64
   569  
   570  	i   int
   571  	src *filesItem
   572  }
   573  
   574  type ctxLocalityIdx struct {
   575  	reader *recsplit.IndexReader
   576  	bm     *bitmapdb.FixedSizeBitmaps
   577  	file   *ctxItem
   578  }
   579  
   580  func ctxItemLess(i, j ctxItem) bool { //nolint
   581  	if i.endTxNum == j.endTxNum {
   582  		return i.startTxNum > j.startTxNum
   583  	}
   584  	return i.endTxNum < j.endTxNum
   585  }
   586  
    587  // DomainContext allows accessing the same domain from multiple goroutines
   588  type DomainContext struct {
   589  	d       *Domain
   590  	files   []ctxItem
   591  	getters []*compress.Getter
   592  	readers []*BtIndex
   593  	hc      *HistoryContext
   594  	keyBuf  [60]byte // 52b key and 8b for inverted step
   595  	numBuf  [8]byte
   596  }
   597  
   598  func (dc *DomainContext) statelessGetter(i int) *compress.Getter {
   599  	if dc.getters == nil {
   600  		dc.getters = make([]*compress.Getter, len(dc.files))
   601  	}
   602  	r := dc.getters[i]
   603  	if r == nil {
   604  		r = dc.files[i].src.decompressor.MakeGetter()
   605  		dc.getters[i] = r
   606  	}
   607  	return r
   608  }
   609  
   610  func (dc *DomainContext) statelessBtree(i int) *BtIndex {
   611  	if dc.readers == nil {
   612  		dc.readers = make([]*BtIndex, len(dc.files))
   613  	}
   614  	r := dc.readers[i]
   615  	if r == nil {
   616  		r = dc.files[i].src.bindex
   617  		dc.readers[i] = r
   618  	}
   619  	return r
   620  }
   621  
   622  func (d *Domain) collectFilesStats() (datsz, idxsz, files uint64) {
   623  	d.History.files.Walk(func(items []*filesItem) bool {
   624  		for _, item := range items {
   625  			if item.index == nil {
   626  				return false
   627  			}
   628  			datsz += uint64(item.decompressor.Size())
   629  			idxsz += uint64(item.index.Size())
   630  			files += 2
   631  		}
   632  		return true
   633  	})
   634  
   635  	d.files.Walk(func(items []*filesItem) bool {
   636  		for _, item := range items {
   637  			if item.index == nil {
   638  				return false
   639  			}
   640  			datsz += uint64(item.decompressor.Size())
   641  			idxsz += uint64(item.index.Size())
   642  			idxsz += uint64(item.bindex.Size())
   643  			files += 3
   644  		}
   645  		return true
   646  	})
   647  
   648  	fcnt, fsz, isz := d.History.InvertedIndex.collectFilesStat()
   649  	datsz += fsz
   650  	files += fcnt
   651  	idxsz += isz
   652  	return
   653  }
   654  
   655  func (d *Domain) MakeContext() *DomainContext {
   656  	dc := &DomainContext{
   657  		d:     d,
   658  		hc:    d.History.MakeContext(),
   659  		files: *d.roFiles.Load(),
   660  	}
   661  	for _, item := range dc.files {
   662  		if !item.src.frozen {
   663  			item.src.refcount.Add(1)
   664  		}
   665  	}
   666  
   667  	return dc
   668  }
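// Typical lifecycle (sketch): every MakeContext must be paired with a Close,
// otherwise the refcounts incremented above never drop back and non-frozen
// files can never be garbage-collected:
//
//	dc := d.MakeContext()
//	defer dc.Close()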
   669  
   670  // Collation is the set of compressors created after aggregation
   671  type Collation struct {
   672  	valuesComp   *compress.Compressor
   673  	historyComp  *compress.Compressor
   674  	indexBitmaps map[string]*roaring64.Bitmap
   675  	valuesPath   string
   676  	historyPath  string
   677  	valuesCount  int
   678  	historyCount int
   679  }
   680  
   681  func (c Collation) Close() {
   682  	if c.valuesComp != nil {
   683  		c.valuesComp.Close()
   684  	}
   685  	if c.historyComp != nil {
   686  		c.historyComp.Close()
   687  	}
   688  }
   689  
   690  type kvpair struct {
   691  	k, v []byte
   692  }
   693  
   694  func (d *Domain) writeCollationPair(valuesComp *compress.Compressor, pairs chan kvpair) (count int, err error) {
   695  	for kv := range pairs {
   696  		if err = valuesComp.AddUncompressedWord(kv.k); err != nil {
   697  			return count, fmt.Errorf("add %s values key [%x]: %w", d.filenameBase, kv.k, err)
   698  		}
   699  		mxCollationSize.Inc()
   700  		count++ // Only counting keys, not values
   701  		if err = valuesComp.AddUncompressedWord(kv.v); err != nil {
   702  			return count, fmt.Errorf("add %s values val [%x]=>[%x]: %w", d.filenameBase, kv.k, kv.v, err)
   703  		}
   704  	}
   705  	return count, nil
   706  }
   707  
   708  // nolint
   709  func (d *Domain) aggregate(ctx context.Context, step uint64, txFrom, txTo uint64, tx kv.Tx, ps *background.ProgressSet) (err error) {
   710  	mxRunningCollations.Inc()
   711  	start := time.Now()
   712  	collation, err := d.collateStream(ctx, step, txFrom, txTo, tx)
   713  	mxRunningCollations.Dec()
   714  	mxCollateTook.UpdateDuration(start)
   715  
    716  	if err != nil {
    717  		collation.Close()
    718  		//return fmt.Errorf("domain collation %q has failed: %w", d.filenameBase, err)
    719  		return err
    720  	}
    721  
    722  	// read the compressor counters only after the error check: on failure they may be nil
    723  	mxCollationSize.Set(uint64(collation.valuesComp.Count()))
    724  	mxCollationSizeHist.Set(uint64(collation.historyComp.Count()))
    725  
    726  	mxRunningMerges.Inc()
    727  
    728  	start = time.Now()
    729  	sf, err := d.buildFiles(ctx, step, collation, ps)
    730  	collation.Close()
    731  
    732  	if err != nil {
    733  		mxRunningMerges.Dec()
    734  		return err
    735  	}
    736  	defer sf.Close()
   737  
   738  	mxRunningMerges.Dec()
   739  
   740  	d.integrateFiles(sf, step*d.aggregationStep, (step+1)*d.aggregationStep)
   741  	d.stats.LastFileBuildingTook = time.Since(start)
   742  	return nil
   743  }
   744  
    745  // collateStream gathers domain changes over the specified step, using a read-only transaction,
    746  // and returns compressors, Elias-Fano structures, and bitmaps
    747  // [txFrom; txTo)
   748  func (d *Domain) collateStream(ctx context.Context, step, txFrom, txTo uint64, roTx kv.Tx) (Collation, error) {
   749  	started := time.Now()
   750  	defer func() {
   751  		d.stats.LastCollationTook = time.Since(started)
   752  	}()
   753  
   754  	hCollation, err := d.History.collate(step, txFrom, txTo, roTx)
   755  	if err != nil {
   756  		return Collation{}, err
   757  	}
   758  
   759  	var valuesComp *compress.Compressor
   760  	closeComp := true
   761  	defer func() {
   762  		if closeComp {
   763  			if valuesComp != nil {
   764  				valuesComp.Close()
   765  			}
   766  		}
   767  	}()
   768  
   769  	valuesPath := filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.kv", d.filenameBase, step, step+1))
   770  	if valuesComp, err = compress.NewCompressor(context.Background(), "collate values", valuesPath, d.tmpdir, compress.MinPatternScore, 1, log.LvlTrace, d.logger); err != nil {
   771  		return Collation{}, fmt.Errorf("create %s values compressor: %w", d.filenameBase, err)
   772  	}
   773  
   774  	keysCursor, err := roTx.CursorDupSort(d.keysTable)
   775  	if err != nil {
   776  		return Collation{}, fmt.Errorf("create %s keys cursor: %w", d.filenameBase, err)
   777  	}
   778  	defer keysCursor.Close()
   779  
   780  	var (
   781  		k, v     []byte
   782  		pos      uint64
   783  		valCount int
   784  		pairs    = make(chan kvpair, 1024)
   785  	)
   786  
   787  	//totalKeys, err := keysCursor.Count()
   788  	//if err != nil {
   789  	//	return Collation{}, fmt.Errorf("failed to obtain keys count for domain %q", d.filenameBase)
   790  	//}
   791  
   792  	eg, _ := errgroup.WithContext(ctx)
   793  	eg.Go(func() error {
   794  		valCount, err = d.writeCollationPair(valuesComp, pairs)
   795  		return err
   796  	})
   797  
   798  	var (
   799  		stepBytes = make([]byte, 8)
   800  		keySuffix = make([]byte, 256+8)
   801  	)
   802  	binary.BigEndian.PutUint64(stepBytes, ^step)
   803  
   804  	for k, _, err = keysCursor.First(); err == nil && k != nil; k, _, err = keysCursor.NextNoDup() {
   805  		pos++
   806  
   807  		if v, err = keysCursor.LastDup(); err != nil {
   808  			return Collation{}, fmt.Errorf("find last %s key for aggregation step k=[%x]: %w", d.filenameBase, k, err)
   809  		}
   810  		if bytes.Equal(v, stepBytes) {
   811  			copy(keySuffix, k)
   812  			copy(keySuffix[len(k):], v)
   813  			ks := len(k) + len(v)
   814  
   815  			v, err := roTx.GetOne(d.valsTable, keySuffix[:ks])
   816  			if err != nil {
   817  				return Collation{}, fmt.Errorf("find last %s value for aggregation step k=[%x]: %w", d.filenameBase, k, err)
   818  			}
   819  
   820  			select {
   821  			case <-ctx.Done():
   822  				return Collation{}, ctx.Err()
   823  			default:
   824  			}
   825  
   826  			pairs <- kvpair{k: k, v: v}
   827  		}
   828  	}
   829  	close(pairs)
   830  	if err != nil {
   831  		return Collation{}, fmt.Errorf("iterate over %s keys cursor: %w", d.filenameBase, err)
   832  	}
   833  
   834  	if err := eg.Wait(); err != nil {
   835  		return Collation{}, fmt.Errorf("collate over %s keys cursor: %w", d.filenameBase, err)
   836  	}
   837  
   838  	closeComp = false
   839  	return Collation{
   840  		valuesPath:   valuesPath,
   841  		valuesComp:   valuesComp,
   842  		valuesCount:  valCount,
   843  		historyPath:  hCollation.historyPath,
   844  		historyComp:  hCollation.historyComp,
   845  		historyCount: hCollation.historyCount,
   846  		indexBitmaps: hCollation.indexBitmaps,
   847  	}, nil
   848  }
   849  
    850  // collate gathers domain changes over the specified step, using a read-only transaction,
    851  // and returns compressors, Elias-Fano structures, and bitmaps
    852  // [txFrom; txTo)
   853  func (d *Domain) collate(ctx context.Context, step, txFrom, txTo uint64, roTx kv.Tx, logEvery *time.Ticker) (Collation, error) {
   854  	started := time.Now()
   855  	defer func() {
   856  		d.stats.LastCollationTook = time.Since(started)
   857  	}()
   858  
   859  	hCollation, err := d.History.collate(step, txFrom, txTo, roTx)
   860  	if err != nil {
   861  		return Collation{}, err
   862  	}
   863  	var valuesComp *compress.Compressor
   864  	closeComp := true
   865  	defer func() {
   866  		if closeComp {
   867  			hCollation.Close()
   868  			if valuesComp != nil {
   869  				valuesComp.Close()
   870  			}
   871  		}
   872  	}()
   873  	valuesPath := filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.kv", d.filenameBase, step, step+1))
   874  	if valuesComp, err = compress.NewCompressor(context.Background(), "collate values", valuesPath, d.tmpdir, compress.MinPatternScore, 1, log.LvlTrace, d.logger); err != nil {
   875  		return Collation{}, fmt.Errorf("create %s values compressor: %w", d.filenameBase, err)
   876  	}
   877  	keysCursor, err := roTx.CursorDupSort(d.keysTable)
   878  	if err != nil {
   879  		return Collation{}, fmt.Errorf("create %s keys cursor: %w", d.filenameBase, err)
   880  	}
   881  	defer keysCursor.Close()
   882  
   883  	var (
   884  		k, v        []byte
   885  		pos         uint64
   886  		valuesCount uint
   887  	)
   888  
    889  	//TODO: use progressSet
   890  	//totalKeys, err := keysCursor.Count()
   891  	//if err != nil {
   892  	//	return Collation{}, fmt.Errorf("failed to obtain keys count for domain %q", d.filenameBase)
   893  	//}
   894  	for k, _, err = keysCursor.First(); err == nil && k != nil; k, _, err = keysCursor.NextNoDup() {
   895  		if err != nil {
   896  			return Collation{}, err
   897  		}
   898  		pos++
   899  		select {
   900  		case <-ctx.Done():
   901  			d.logger.Warn("[snapshots] collate domain cancelled", "name", d.filenameBase, "err", ctx.Err())
   902  			return Collation{}, ctx.Err()
   903  		default:
   904  		}
   905  
   906  		if v, err = keysCursor.LastDup(); err != nil {
   907  			return Collation{}, fmt.Errorf("find last %s key for aggregation step k=[%x]: %w", d.filenameBase, k, err)
   908  		}
   909  		s := ^binary.BigEndian.Uint64(v)
   910  		if s == step {
   911  			keySuffix := make([]byte, len(k)+8)
   912  			copy(keySuffix, k)
   913  			copy(keySuffix[len(k):], v)
   914  			v, err := roTx.GetOne(d.valsTable, keySuffix)
   915  			if err != nil {
   916  				return Collation{}, fmt.Errorf("find last %s value for aggregation step k=[%x]: %w", d.filenameBase, k, err)
   917  			}
   918  			if err = valuesComp.AddUncompressedWord(k); err != nil {
   919  				return Collation{}, fmt.Errorf("add %s values key [%x]: %w", d.filenameBase, k, err)
   920  			}
   921  			valuesCount++ // Only counting keys, not values
   922  			if err = valuesComp.AddUncompressedWord(v); err != nil {
   923  				return Collation{}, fmt.Errorf("add %s values val [%x]=>[%x]: %w", d.filenameBase, k, v, err)
   924  			}
   925  		}
   926  	}
   927  	if err != nil {
   928  		return Collation{}, fmt.Errorf("iterate over %s keys cursor: %w", d.filenameBase, err)
   929  	}
   930  	closeComp = false
   931  	return Collation{
   932  		valuesPath:   valuesPath,
   933  		valuesComp:   valuesComp,
   934  		valuesCount:  int(valuesCount),
   935  		historyPath:  hCollation.historyPath,
   936  		historyComp:  hCollation.historyComp,
   937  		historyCount: hCollation.historyCount,
   938  		indexBitmaps: hCollation.indexBitmaps,
   939  	}, nil
   940  }
   941  
   942  type StaticFiles struct {
   943  	valuesDecomp    *compress.Decompressor
   944  	valuesIdx       *recsplit.Index
   945  	valuesBt        *BtIndex
   946  	historyDecomp   *compress.Decompressor
   947  	historyIdx      *recsplit.Index
   948  	efHistoryDecomp *compress.Decompressor
   949  	efHistoryIdx    *recsplit.Index
   950  }
   951  
   952  func (sf StaticFiles) Close() {
   953  	if sf.valuesDecomp != nil {
   954  		sf.valuesDecomp.Close()
   955  	}
   956  	if sf.valuesIdx != nil {
   957  		sf.valuesIdx.Close()
   958  	}
   959  	if sf.valuesBt != nil {
   960  		sf.valuesBt.Close()
   961  	}
   962  	if sf.historyDecomp != nil {
   963  		sf.historyDecomp.Close()
   964  	}
   965  	if sf.historyIdx != nil {
   966  		sf.historyIdx.Close()
   967  	}
   968  	if sf.efHistoryDecomp != nil {
   969  		sf.efHistoryDecomp.Close()
   970  	}
   971  	if sf.efHistoryIdx != nil {
   972  		sf.efHistoryIdx.Close()
   973  	}
   974  }
   975  
    976  // buildFiles performs the potentially resource-intensive operations of creating
    977  // static files and their indices
   978  func (d *Domain) buildFiles(ctx context.Context, step uint64, collation Collation, ps *background.ProgressSet) (StaticFiles, error) {
   979  	hStaticFiles, err := d.History.buildFiles(ctx, step, HistoryCollation{
   980  		historyPath:  collation.historyPath,
   981  		historyComp:  collation.historyComp,
   982  		historyCount: collation.historyCount,
   983  		indexBitmaps: collation.indexBitmaps,
   984  	}, ps)
   985  	if err != nil {
   986  		return StaticFiles{}, err
   987  	}
   988  	valuesComp := collation.valuesComp
   989  	var valuesDecomp *compress.Decompressor
   990  	var valuesIdx *recsplit.Index
   991  	closeComp := true
   992  	defer func() {
   993  		if closeComp {
   994  			hStaticFiles.Close()
   995  			if valuesComp != nil {
   996  				valuesComp.Close()
   997  			}
   998  			if valuesDecomp != nil {
   999  				valuesDecomp.Close()
  1000  			}
  1001  			if valuesIdx != nil {
  1002  				valuesIdx.Close()
  1003  			}
  1004  		}
  1005  	}()
  1006  	if d.noFsync {
  1007  		valuesComp.DisableFsync()
  1008  	}
  1009  	if err = valuesComp.Compress(); err != nil {
  1010  		return StaticFiles{}, fmt.Errorf("compress %s values: %w", d.filenameBase, err)
  1011  	}
  1012  	valuesComp.Close()
  1013  	valuesComp = nil
  1014  	if valuesDecomp, err = compress.NewDecompressor(collation.valuesPath); err != nil {
  1015  		return StaticFiles{}, fmt.Errorf("open %s values decompressor: %w", d.filenameBase, err)
  1016  	}
  1017  
  1018  	valuesIdxFileName := fmt.Sprintf("%s.%d-%d.kvi", d.filenameBase, step, step+1)
  1019  	valuesIdxPath := filepath.Join(d.dir, valuesIdxFileName)
  1020  	{
  1021  		p := ps.AddNew(valuesIdxFileName, uint64(valuesDecomp.Count()*2))
  1022  		defer ps.Delete(p)
  1023  		if valuesIdx, err = buildIndexThenOpen(ctx, valuesDecomp, valuesIdxPath, d.tmpdir, collation.valuesCount, false, p, d.logger, d.noFsync); err != nil {
  1024  			return StaticFiles{}, fmt.Errorf("build %s values idx: %w", d.filenameBase, err)
  1025  		}
  1026  	}
  1027  
  1028  	var bt *BtIndex
  1029  	{
  1030  		btFileName := strings.TrimSuffix(valuesIdxFileName, "kvi") + "bt"
  1031  		btPath := filepath.Join(d.dir, btFileName)
  1032  		p := ps.AddNew(btFileName, uint64(valuesDecomp.Count()*2))
  1033  		defer ps.Delete(p)
  1034  		bt, err = CreateBtreeIndexWithDecompressor(btPath, DefaultBtreeM, valuesDecomp, p, d.tmpdir, d.logger)
  1035  		if err != nil {
  1036  			return StaticFiles{}, fmt.Errorf("build %s values bt idx: %w", d.filenameBase, err)
  1037  		}
  1038  	}
  1039  
  1040  	closeComp = false
  1041  	return StaticFiles{
  1042  		valuesDecomp:    valuesDecomp,
  1043  		valuesIdx:       valuesIdx,
  1044  		valuesBt:        bt,
  1045  		historyDecomp:   hStaticFiles.historyDecomp,
  1046  		historyIdx:      hStaticFiles.historyIdx,
  1047  		efHistoryDecomp: hStaticFiles.efHistoryDecomp,
  1048  		efHistoryIdx:    hStaticFiles.efHistoryIdx,
  1049  	}, nil
  1050  }
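// To summarize the per-step artifacts produced above: a compressed ".kv" file
// with alternating key/value words, a recsplit ".kvi" index over it, and a
// ".bt" btree index built from the same decompressor; History and
// InvertedIndex contribute the ".v"/".vi" and ".ef"/".efi" pairs.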
  1051  
  1052  func (d *Domain) missedIdxFiles() (l []*filesItem) {
  1053  	d.files.Walk(func(items []*filesItem) bool { // don't run slow logic while iterating on btree
  1054  		for _, item := range items {
  1055  			fromStep, toStep := item.startTxNum/d.aggregationStep, item.endTxNum/d.aggregationStep
  1056  			if !dir.FileExist(filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.bt", d.filenameBase, fromStep, toStep))) {
  1057  				l = append(l, item)
  1058  			}
  1059  		}
  1060  		return true
  1061  	})
  1062  	return l
  1063  }
  1064  
   1065  // BuildMissedIndices - produce .efi/.vi/.bt from .ef/.v/.kv (building .kvi is still TODO below)
  1066  func (d *Domain) BuildMissedIndices(ctx context.Context, g *errgroup.Group, ps *background.ProgressSet) (err error) {
  1067  	d.History.BuildMissedIndices(ctx, g, ps)
  1068  	d.InvertedIndex.BuildMissedIndices(ctx, g, ps)
  1069  	for _, item := range d.missedIdxFiles() {
  1070  		//TODO: build .kvi
  1071  		fitem := item
  1072  		g.Go(func() error {
   1073  			idxPath := fitem.decompressor.FilePath()
  1074  			idxPath = strings.TrimSuffix(idxPath, "kv") + "bt"
  1075  
  1076  			p := ps.AddNew("fixme", uint64(fitem.decompressor.Count()))
  1077  			defer ps.Delete(p)
  1078  			if err := BuildBtreeIndexWithDecompressor(idxPath, fitem.decompressor, p, d.tmpdir, d.logger); err != nil {
  1079  				return fmt.Errorf("failed to build btree index for %s:  %w", fitem.decompressor.FileName(), err)
  1080  			}
  1081  			return nil
  1082  		})
  1083  	}
  1084  	return nil
  1085  }
  1086  
  1087  func buildIndexThenOpen(ctx context.Context, d *compress.Decompressor, idxPath, tmpdir string, count int, values bool, p *background.Progress, logger log.Logger, noFsync bool) (*recsplit.Index, error) {
  1088  	if err := buildIndex(ctx, d, idxPath, tmpdir, count, values, p, logger, noFsync); err != nil {
  1089  		return nil, err
  1090  	}
  1091  	return recsplit.OpenIndex(idxPath)
  1092  }
  1093  
  1094  func buildIndex(ctx context.Context, d *compress.Decompressor, idxPath, tmpdir string, count int, values bool, p *background.Progress, logger log.Logger, noFsync bool) error {
  1095  	var rs *recsplit.RecSplit
  1096  	var err error
  1097  	if rs, err = recsplit.NewRecSplit(recsplit.RecSplitArgs{
  1098  		KeyCount:   count,
  1099  		Enums:      false,
  1100  		BucketSize: 2000,
  1101  		LeafSize:   8,
  1102  		TmpDir:     tmpdir,
  1103  		IndexFile:  idxPath,
  1104  	}, logger); err != nil {
  1105  		return fmt.Errorf("create recsplit: %w", err)
  1106  	}
  1107  	defer rs.Close()
  1108  	rs.LogLvl(log.LvlTrace)
  1109  	if noFsync {
  1110  		rs.DisableFsync()
  1111  	}
  1112  	defer d.EnableMadvNormal().DisableReadAhead()
  1113  
  1114  	word := make([]byte, 0, 256)
  1115  	var keyPos, valPos uint64
  1116  	g := d.MakeGetter()
  1117  	for {
  1118  		if err := ctx.Err(); err != nil {
  1119  			logger.Warn("recsplit index building cancelled", "err", err)
  1120  			return err
  1121  		}
  1122  		g.Reset(0)
  1123  		for g.HasNext() {
  1124  			word, valPos = g.Next(word[:0])
  1125  			if values {
  1126  				if err = rs.AddKey(word, valPos); err != nil {
  1127  					return fmt.Errorf("add idx key [%x]: %w", word, err)
  1128  				}
  1129  			} else {
  1130  				if err = rs.AddKey(word, keyPos); err != nil {
  1131  					return fmt.Errorf("add idx key [%x]: %w", word, err)
  1132  				}
  1133  			}
  1134  			// Skip value
  1135  			keyPos, _ = g.Skip()
  1136  
  1137  			p.Processed.Add(1)
  1138  		}
  1139  		if err = rs.Build(ctx); err != nil {
  1140  			if rs.Collision() {
  1141  				logger.Info("Building recsplit. Collision happened. It's ok. Restarting...")
  1142  				rs.ResetNextSalt()
  1143  			} else {
  1144  				return fmt.Errorf("build idx: %w", err)
  1145  			}
  1146  		} else {
  1147  			break
  1148  		}
  1149  	}
  1150  	return nil
  1151  }
  1152  
  1153  func (d *Domain) integrateFiles(sf StaticFiles, txNumFrom, txNumTo uint64) {
  1154  	d.History.integrateFiles(HistoryFiles{
  1155  		historyDecomp:   sf.historyDecomp,
  1156  		historyIdx:      sf.historyIdx,
  1157  		efHistoryDecomp: sf.efHistoryDecomp,
  1158  		efHistoryIdx:    sf.efHistoryIdx,
  1159  	}, txNumFrom, txNumTo)
  1160  
  1161  	fi := newFilesItem(txNumFrom, txNumTo, d.aggregationStep)
  1162  	fi.decompressor = sf.valuesDecomp
  1163  	fi.index = sf.valuesIdx
  1164  	fi.bindex = sf.valuesBt
  1165  	d.files.Set(fi)
  1166  
  1167  	d.reCalcRoFiles()
  1168  }
  1169  
  1170  // [txFrom; txTo)
  1171  func (d *Domain) prune(ctx context.Context, step uint64, txFrom, txTo, limit uint64, logEvery *time.Ticker) error {
  1172  	defer func(t time.Time) { d.stats.LastPruneTook = time.Since(t) }(time.Now())
  1173  	mxPruningProgress.Inc()
  1174  	defer mxPruningProgress.Dec()
  1175  
  1176  	var (
  1177  		_state    = "scan steps"
  1178  		pos       atomic.Uint64
  1179  		totalKeys uint64
  1180  	)
  1181  
  1182  	keysCursor, err := d.tx.RwCursorDupSort(d.keysTable)
  1183  	if err != nil {
  1184  		return fmt.Errorf("%s keys cursor: %w", d.filenameBase, err)
  1185  	}
  1186  	defer keysCursor.Close()
  1187  
  1188  	totalKeys, err = keysCursor.Count()
  1189  	if err != nil {
  1190  		return fmt.Errorf("get count of %s keys: %w", d.filenameBase, err)
  1191  	}
  1192  
  1193  	var (
  1194  		k, v, stepBytes []byte
  1195  		keyMaxSteps     = make(map[string]uint64)
  1196  		c               = 0
  1197  	)
  1198  	stepBytes = make([]byte, 8)
  1199  	binary.BigEndian.PutUint64(stepBytes, ^step)
  1200  
  1201  	for k, v, err = keysCursor.First(); err == nil && k != nil; k, v, err = keysCursor.Next() {
  1202  		if bytes.Equal(v, stepBytes) {
  1203  			c++
  1204  			kl, vl, err := keysCursor.PrevDup()
  1205  			if err != nil {
  1206  				break
  1207  			}
  1208  			if kl == nil && vl == nil {
  1209  				continue
  1210  			}
  1211  			s := ^binary.BigEndian.Uint64(vl)
  1212  			if s > step {
  1213  				_, vn, err := keysCursor.NextDup()
  1214  				if err != nil {
  1215  					break
  1216  				}
  1217  				if bytes.Equal(vn, stepBytes) {
  1218  					if err := keysCursor.DeleteCurrent(); err != nil {
  1219  						return fmt.Errorf("prune key %x: %w", k, err)
  1220  					}
  1221  					mxPruneSize.Inc()
  1222  					keyMaxSteps[string(k)] = s
  1223  				}
  1224  			}
  1225  		}
  1226  		pos.Add(1)
  1227  
  1228  		if ctx.Err() != nil {
  1229  			d.logger.Warn("[snapshots] prune domain cancelled", "name", d.filenameBase, "err", ctx.Err())
  1230  			return ctx.Err()
  1231  		}
  1232  
  1233  		select {
  1234  		case <-ctx.Done():
  1235  			return ctx.Err()
  1236  		case <-logEvery.C:
  1237  			d.logger.Info("[snapshots] prune domain", "name", d.filenameBase,
  1238  				"stage", _state,
  1239  				"range", fmt.Sprintf("%.2f-%.2f", float64(txFrom)/float64(d.aggregationStep), float64(txTo)/float64(d.aggregationStep)),
  1240  				"progress", fmt.Sprintf("%.2f%%", (float64(pos.Load())/float64(totalKeys))*100))
  1241  		default:
  1242  		}
  1243  	}
  1244  	if err != nil {
  1245  		return fmt.Errorf("iterate of %s keys: %w", d.filenameBase, err)
  1246  	}
  1247  
  1248  	pos.Store(0)
   1249  	// It is important to clean up tables in a specific order
   1250  	// First keysTable, because it is the first one accessed in the `get` function, i.e. if the record is deleted from there, other tables will not be accessed
  1251  	var valsCursor kv.RwCursor
  1252  	if valsCursor, err = d.tx.RwCursor(d.valsTable); err != nil {
  1253  		return fmt.Errorf("%s vals cursor: %w", d.filenameBase, err)
  1254  	}
  1255  	defer valsCursor.Close()
  1256  
   1257  	for k, _, err = valsCursor.First(); err == nil && k != nil; k, _, err = valsCursor.Next() {
  1258  		if bytes.HasSuffix(k, stepBytes) {
  1259  			if _, ok := keyMaxSteps[string(k)]; !ok {
  1260  				continue
  1261  			}
  1262  			if err := valsCursor.DeleteCurrent(); err != nil {
  1263  				return fmt.Errorf("prune val %x: %w", k, err)
  1264  			}
  1265  			mxPruneSize.Inc()
  1266  		}
  1267  		pos.Add(1)
  1268  		//_prog = 100 * (float64(pos) / float64(totalKeys))
  1269  
  1270  		select {
  1271  		case <-ctx.Done():
  1272  			return ctx.Err()
  1273  		case <-logEvery.C:
  1274  			d.logger.Info("[snapshots] prune domain", "name", d.filenameBase, "step", step)
  1275  			//"steps", fmt.Sprintf("%.2f-%.2f", float64(txFrom)/float64(d.aggregationStep), float64(txTo)/float64(d.aggregationStep)))
  1276  		default:
  1277  		}
  1278  	}
  1279  	if err != nil {
  1280  		return fmt.Errorf("iterate over %s vals: %w", d.filenameBase, err)
  1281  	}
  1282  
  1283  	defer func(t time.Time) { d.stats.LastPruneHistTook = time.Since(t) }(time.Now())
  1284  
  1285  	if err = d.History.prune(ctx, txFrom, txTo, limit, logEvery); err != nil {
  1286  		return fmt.Errorf("prune history at step %d [%d, %d): %w", step, txFrom, txTo, err)
  1287  	}
  1288  	return nil
  1289  }
  1290  
  1291  func (d *Domain) isEmpty(tx kv.Tx) (bool, error) {
  1292  	k, err := kv.FirstKey(tx, d.keysTable)
  1293  	if err != nil {
  1294  		return false, err
  1295  	}
  1296  	k2, err := kv.FirstKey(tx, d.valsTable)
  1297  	if err != nil {
  1298  		return false, err
  1299  	}
  1300  	isEmptyHist, err := d.History.isEmpty(tx)
  1301  	if err != nil {
  1302  		return false, err
  1303  	}
  1304  	return k == nil && k2 == nil && isEmptyHist, nil
  1305  }
  1306  
  1307  // nolint
  1308  func (d *Domain) warmup(ctx context.Context, txFrom, limit uint64, tx kv.Tx) error {
  1309  	domainKeysCursor, err := tx.CursorDupSort(d.keysTable)
  1310  	if err != nil {
  1311  		return fmt.Errorf("create %s domain cursor: %w", d.filenameBase, err)
  1312  	}
  1313  	defer domainKeysCursor.Close()
  1314  	var txKey [8]byte
  1315  	binary.BigEndian.PutUint64(txKey[:], txFrom)
  1316  	idxC, err := tx.CursorDupSort(d.keysTable)
  1317  	if err != nil {
  1318  		return err
  1319  	}
  1320  	defer idxC.Close()
  1321  	valsC, err := tx.Cursor(d.valsTable)
  1322  	if err != nil {
  1323  		return err
  1324  	}
  1325  	defer valsC.Close()
  1326  	k, v, err := domainKeysCursor.Seek(txKey[:])
  1327  	if err != nil {
  1328  		return err
  1329  	}
  1330  	if k == nil {
  1331  		return nil
  1332  	}
  1333  	txFrom = binary.BigEndian.Uint64(k)
  1334  	txTo := txFrom + d.aggregationStep
  1335  	if limit != math.MaxUint64 && limit != 0 {
  1336  		txTo = txFrom + limit
  1337  	}
  1338  	for ; err == nil && k != nil; k, v, err = domainKeysCursor.Next() {
  1339  		txNum := binary.BigEndian.Uint64(k)
  1340  		if txNum >= txTo {
  1341  			break
  1342  		}
  1343  		_, _, _ = valsC.Seek(v[len(v)-8:])
  1344  		_, _ = idxC.SeekBothRange(v[:len(v)-8], k)
  1345  
  1346  		select {
  1347  		case <-ctx.Done():
  1348  			return ctx.Err()
  1349  		default:
  1350  		}
  1351  	}
  1352  	if err != nil {
  1353  		return fmt.Errorf("iterate over %s domain keys: %w", d.filenameBase, err)
  1354  	}
  1355  
  1356  	return d.History.warmup(ctx, txFrom, limit, tx)
  1357  }
  1358  
   1359  var COMPARE_INDEXES = false // if true, will compare values from Btree and InvertedIndex
  1360  
  1361  func (dc *DomainContext) readFromFiles(filekey []byte, fromTxNum uint64) ([]byte, bool, error) {
  1362  	var val []byte
  1363  	var found bool
  1364  
  1365  	for i := len(dc.files) - 1; i >= 0; i-- {
  1366  		if dc.files[i].endTxNum < fromTxNum {
  1367  			break
  1368  		}
  1369  		reader := dc.statelessBtree(i)
  1370  		if reader.Empty() {
  1371  			continue
  1372  		}
  1373  		cur, err := reader.Seek(filekey)
  1374  		if err != nil {
  1375  			//return nil, false, nil //TODO: uncomment me
  1376  			return nil, false, err
  1377  		}
  1378  		if cur == nil {
  1379  			continue
  1380  		}
  1381  
  1382  		if bytes.Equal(cur.Key(), filekey) {
  1383  			val = cur.Value()
  1384  			found = true
  1385  			break
  1386  		}
  1387  	}
  1388  	return val, found, nil
  1389  }
  1390  
   1391  // historyBeforeTxNum searches history for a value of the specified key before txNum.
   1392  // The second return value is true if the value is found in the history (even if it is nil).
  1393  func (dc *DomainContext) historyBeforeTxNum(key []byte, txNum uint64, roTx kv.Tx) ([]byte, bool, error) {
  1394  	dc.d.stats.HistoryQueries.Add(1)
  1395  
  1396  	v, found, err := dc.hc.GetNoState(key, txNum)
  1397  	if err != nil {
  1398  		return nil, false, err
  1399  	}
  1400  	if found {
  1401  		return v, true, nil
  1402  	}
  1403  
  1404  	var anyItem bool
  1405  	var topState ctxItem
  1406  	for _, item := range dc.hc.ic.files {
  1407  		if item.endTxNum < txNum {
  1408  			continue
  1409  		}
  1410  		anyItem = true
  1411  		topState = item
  1412  		break
  1413  	}
  1414  	if anyItem {
  1415  		// If there were no changes but there were history files, the value can be obtained from value files
  1416  		var val []byte
  1417  		for i := len(dc.files) - 1; i >= 0; i-- {
  1418  			if dc.files[i].startTxNum > topState.startTxNum {
  1419  				continue
  1420  			}
  1421  			reader := dc.statelessBtree(i)
  1422  			if reader.Empty() {
  1423  				continue
  1424  			}
  1425  			cur, err := reader.Seek(key)
  1426  			if err != nil {
  1427  				dc.d.logger.Warn("failed to read history before from file", "key", key, "err", err)
  1428  				return nil, false, err
  1429  			}
  1430  			if cur == nil {
  1431  				continue
  1432  			}
  1433  			if bytes.Equal(cur.Key(), key) {
  1434  				val = cur.Value()
  1435  				break
  1436  			}
  1437  		}
  1438  		return val, true, nil
  1439  	}
  1440  	// Value not found in history files, look in the recent history
  1441  	if roTx == nil {
  1442  		return nil, false, fmt.Errorf("roTx is nil")
  1443  	}
  1444  	return dc.hc.getNoStateFromDB(key, txNum, roTx)
  1445  }
  1446  
   1447  // GetBeforeTxNum does not always require usage of roTx. If it is possible to determine
   1448  // the historical value based only on static files, roTx will not be used.
  1449  func (dc *DomainContext) GetBeforeTxNum(key []byte, txNum uint64, roTx kv.Tx) ([]byte, error) {
  1450  	v, hOk, err := dc.historyBeforeTxNum(key, txNum, roTx)
  1451  	if err != nil {
  1452  		return nil, err
  1453  	}
  1454  	if hOk {
   1455  		// if history returned the marker of key creation,
   1456  		// the domain must return nil
  1457  		if len(v) == 0 {
  1458  			return nil, nil
  1459  		}
  1460  		return v, nil
  1461  	}
  1462  	if v, _, err = dc.get(key, txNum-1, roTx); err != nil {
  1463  		return nil, err
  1464  	}
  1465  	return v, nil
  1466  }
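// Illustrative historical-read sketch (key and txNum are hypothetical): fetch
// the value of key as it was before transaction number 1_000_000, touching
// roTx only when static files alone cannot answer the query:
//
//	dc := d.MakeContext()
//	defer dc.Close()
//	v, err := dc.GetBeforeTxNum(key, 1_000_000, roTx)
//	// a nil result means the key did not exist before that txNum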
  1467  
  1468  func (dc *DomainContext) Close() {
  1469  	for _, item := range dc.files {
  1470  		if item.src.frozen {
  1471  			continue
  1472  		}
  1473  		refCnt := item.src.refcount.Add(-1)
   1474  		// GC: the last reader is responsible for removing useless files: close and delete them
  1475  		if refCnt == 0 && item.src.canDelete.Load() {
  1476  			item.src.closeFilesAndRemove()
  1477  		}
  1478  	}
  1479  	dc.hc.Close()
  1480  }
  1481  
   1482  // IteratePrefix iterates over key-value pairs of the domain that start with the given prefix.
   1483  // Such iteration is not intended to be used in the public API, therefore it uses the read-write transaction
   1484  // inside the domain. Another version of this for public API use needs to be created, one that uses
   1485  // roTx instead and supports ending the iteration before it reaches the end.
  1486  func (dc *DomainContext) IteratePrefix(prefix []byte, it func(k, v []byte)) error {
  1487  	dc.d.stats.HistoryQueries.Add(1)
  1488  
  1489  	var cp CursorHeap
  1490  	heap.Init(&cp)
  1491  	var k, v []byte
  1492  	var err error
  1493  	keysCursor, err := dc.d.tx.CursorDupSort(dc.d.keysTable)
  1494  	if err != nil {
  1495  		return err
  1496  	}
  1497  	defer keysCursor.Close()
  1498  	if k, v, err = keysCursor.Seek(prefix); err != nil {
  1499  		return err
  1500  	}
  1501  	if bytes.HasPrefix(k, prefix) {
  1502  		keySuffix := make([]byte, len(k)+8)
  1503  		copy(keySuffix, k)
  1504  		copy(keySuffix[len(k):], v)
  1505  		step := ^binary.BigEndian.Uint64(v)
  1506  		txNum := step * dc.d.aggregationStep
  1507  		if v, err = dc.d.tx.GetOne(dc.d.valsTable, keySuffix); err != nil {
  1508  			return err
  1509  		}
  1510  		heap.Push(&cp, &CursorItem{t: DB_CURSOR, key: common.Copy(k), val: common.Copy(v), c: keysCursor, endTxNum: txNum, reverse: true})
  1511  	}
  1512  
  1513  	for i, item := range dc.files {
  1514  		bg := dc.statelessBtree(i)
  1515  		if bg.Empty() {
  1516  			continue
  1517  		}
  1518  
  1519  		cursor, err := bg.Seek(prefix)
  1520  		if err != nil {
  1521  			continue
  1522  		}
  1523  
  1524  		g := dc.statelessGetter(i)
  1525  		key := cursor.Key()
  1526  		if bytes.HasPrefix(key, prefix) {
  1527  			val := cursor.Value()
  1528  			heap.Push(&cp, &CursorItem{t: FILE_CURSOR, key: key, val: val, dg: g, endTxNum: item.endTxNum, reverse: true})
  1529  		}
  1530  	}
  1531  	for cp.Len() > 0 {
  1532  		lastKey := common.Copy(cp[0].key)
  1533  		lastVal := common.Copy(cp[0].val)
  1534  		// Advance all the items that have this key (including the top)
  1535  		for cp.Len() > 0 && bytes.Equal(cp[0].key, lastKey) {
  1536  			ci1 := cp[0]
  1537  			switch ci1.t {
  1538  			case FILE_CURSOR:
  1539  				if ci1.dg.HasNext() {
  1540  					ci1.key, _ = ci1.dg.Next(ci1.key[:0])
  1541  					if bytes.HasPrefix(ci1.key, prefix) {
  1542  						ci1.val, _ = ci1.dg.Next(ci1.val[:0])
  1543  						heap.Fix(&cp, 0)
  1544  					} else {
  1545  						heap.Pop(&cp)
  1546  					}
  1547  				} else {
  1548  					heap.Pop(&cp)
  1549  				}
  1550  			case DB_CURSOR:
  1551  				k, v, err = ci1.c.NextNoDup()
  1552  				if err != nil {
  1553  					return err
  1554  				}
  1555  				if k != nil && bytes.HasPrefix(k, prefix) {
  1556  					ci1.key = common.Copy(k)
  1557  					keySuffix := make([]byte, len(k)+8)
  1558  					copy(keySuffix, k)
  1559  					copy(keySuffix[len(k):], v)
  1560  					if v, err = dc.d.tx.GetOne(dc.d.valsTable, keySuffix); err != nil {
  1561  						return err
  1562  					}
  1563  					ci1.val = common.Copy(v)
  1564  					heap.Fix(&cp, 0)
  1565  				} else {
  1566  					heap.Pop(&cp)
  1567  				}
  1568  			}
  1569  		}
  1570  		if len(lastVal) > 0 {
  1571  			it(lastKey, lastVal)
  1572  		}
  1573  	}
  1574  	return nil
  1575  }
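// Illustrative prefix-scan sketch (prefix is hypothetical; requires d.tx to be
// set, since IteratePrefix reads through the domain's own transaction):
//
//	var pairs [][2][]byte
//	err := dc.IteratePrefix(prefix, func(k, v []byte) {
//		pairs = append(pairs, [2][]byte{common.Copy(k), common.Copy(v)})
//	})
//
// Keys deleted in a newer step carry an empty value and are filtered out by
// the `len(lastVal) > 0` check above, so the callback only sees live pairs.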