github.com/ledgerwatch/erigon-lib@v1.0.0/etl/collector.go

/*
   Copyright 2021 Erigon contributors

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package etl

import (
	"bytes"
	"encoding/hex"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"time"

	"github.com/c2h5oh/datasize"
	"github.com/ledgerwatch/log/v3"

	"github.com/ledgerwatch/erigon-lib/common"
	"github.com/ledgerwatch/erigon-lib/kv"
)

type LoadNextFunc func(originalK, k, v []byte) error
type LoadFunc func(k, v []byte, table CurrentTableReader, next LoadNextFunc) error
type simpleLoadFunc func(k, v []byte) error
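
// A LoadFunc receives each key/value pair in sorted order and decides what
// reaches the database by calling next. A minimal hypothetical sketch: a
// LoadFunc that prefixes every value with a version byte before forwarding it.
//
//	var prefixValues LoadFunc = func(k, v []byte, _ CurrentTableReader, next LoadNextFunc) error {
//		// 0x01 is an arbitrary illustrative prefix, not something this package prescribes
//		return next(k, k, append([]byte{0x01}, v...))
//	}
//
// IdentityLoadFunc in this package simply calls next(k, k, v); Load checks for
// it below to decide whether sorted-input guarantees still hold.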

// Collector performs the job of ETL Transform, but can also be used without the
// "E" (Extract) part, i.e. as Collect-Transform-Load.
type Collector struct {
	buf           Buffer
	logPrefix     string
	tmpdir        string
	dataProviders []dataProvider
	logLvl        log.Lvl
	bufType       int
	allFlushed    bool
	autoClean     bool
	logger        log.Logger
}

// NewCollectorFromFiles creates a collector from existing files (left over from a previous unsuccessful loading)
func NewCollectorFromFiles(logPrefix, tmpdir string, logger log.Logger) (*Collector, error) {
	if _, err := os.Stat(tmpdir); os.IsNotExist(err) {
		return nil, nil
	}
	dirEntries, err := os.ReadDir(tmpdir)
	if err != nil {
		return nil, fmt.Errorf("collector from files - reading directory %s: %w", tmpdir, err)
	}
	if len(dirEntries) == 0 {
		return nil, nil
	}
	dataProviders := make([]dataProvider, len(dirEntries))
	for i, dirEntry := range dirEntries {
		fileInfo, err := dirEntry.Info()
		if err != nil {
			return nil, fmt.Errorf("collector from files - reading file info %s: %w", dirEntry.Name(), err)
		}
		var dataProvider fileDataProvider
		dataProvider.file, err = os.Open(filepath.Join(tmpdir, fileInfo.Name()))
		if err != nil {
			return nil, fmt.Errorf("collector from files - opening file %s: %w", fileInfo.Name(), err)
		}
		dataProviders[i] = &dataProvider
	}
	// the logger must be retained here, otherwise Load would log via a nil logger
	return &Collector{dataProviders: dataProviders, allFlushed: true, autoClean: false, logPrefix: logPrefix, logger: logger}, nil
}
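
// A hypothetical recovery sketch: after a crash during a critical load, the
// spill files left in tmpdir can be replayed into the target table. The table
// name and tmpdir path here are illustrative, not prescribed by this package.
//
//	c, err := NewCollectorFromFiles("[recover]", "/tmp/etl-leftover", log.New())
//	if err != nil || c == nil {
//		return err // a nil collector means there was nothing to recover
//	}
//	defer c.Close()
//	err = c.Load(tx, kv.HeaderNumber, IdentityLoadFunc, TransformArgs{})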

// NewCriticalCollector does not clean up temporary files if loading has failed
func NewCriticalCollector(logPrefix, tmpdir string, sortableBuffer Buffer, logger log.Logger) *Collector {
	c := NewCollector(logPrefix, tmpdir, sortableBuffer, logger)
	c.autoClean = false
	return c
}

func NewCollector(logPrefix, tmpdir string, sortableBuffer Buffer, logger log.Logger) *Collector {
	return &Collector{autoClean: true, bufType: getTypeByBuffer(sortableBuffer), buf: sortableBuffer, logPrefix: logPrefix, tmpdir: tmpdir, logLvl: log.LvlInfo, logger: logger}
}
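
// Typical usage, sketched under the assumption of an open kv.RwTx named tx and
// kv.HeaderNumber as an arbitrary example destination table:
//
//	c := NewCollector("[example]", os.TempDir(), NewSortableBuffer(BufferOptimalSize), log.New())
//	defer c.Close()
//	for _, pair := range pairs { // pairs is hypothetical caller data
//		if err := c.Collect(pair.Key, pair.Value); err != nil {
//			return err
//		}
//	}
//	if err := c.Load(tx, kv.HeaderNumber, IdentityLoadFunc, TransformArgs{}); err != nil {
//		return err
//	}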

func (c *Collector) extractNextFunc(originalK, k []byte, v []byte) error {
	c.buf.Put(k, v)
	if !c.buf.CheckFlushSize() {
		return nil
	}
	// the buffer reached its flush size: sort it and spill it to a temporary file
	return c.flushBuffer(false)
}

func (c *Collector) Collect(k, v []byte) error {
	return c.extractNextFunc(k, k, v)
}

func (c *Collector) LogLvl(v log.Lvl) { c.logLvl = v }

func (c *Collector) flushBuffer(canStoreInRam bool) error {
	if c.buf.Len() == 0 {
		return nil
	}

	var provider dataProvider
	if canStoreInRam && len(c.dataProviders) == 0 {
		// nothing has been spilled yet and the caller allows it: keep the
		// sorted buffer in RAM and serve the load directly from it
		c.buf.Sort()
		provider = KeepInRAM(c.buf)
		c.allFlushed = true
	} else {
		fullBuf := c.buf
		prevLen, prevSize := fullBuf.Len(), fullBuf.SizeLimit()
		c.buf = getBufferByType(c.bufType, datasize.ByteSize(c.buf.SizeLimit()))

		doFsync := !c.autoClean /* is critical collector */
		var err error
		provider, err = FlushToDisk(c.logPrefix, fullBuf, c.tmpdir, doFsync, c.logLvl)
		if err != nil {
			return err
		}
		// pre-allocate a fraction of the previous buffer's size for the fresh buffer
		c.buf.Prealloc(prevLen/8, prevSize/8)
	}
	if provider != nil {
		c.dataProviders = append(c.dataProviders, provider)
	}
	return nil
}
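
// Note: Load passes canStoreInRam=true, so a collector whose data never
// exceeded a single buffer is served straight from memory and writes no files;
// Collect and Flush always pass false and spill to tmpdir.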

// Flush - an optional method (usually the user doesn't need to call it) - forces a sort+flush of the current buffer.
// It triggers a background sort and flush, reducing RAM usage.
// It's useful when working with many collectors: to trigger a background sort for all of them.
func (c *Collector) Flush() error {
	if !c.allFlushed {
		if e := c.flushBuffer(false); e != nil {
			return e
		}
	}
	return nil
}
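
// A sketch of the many-collectors pattern the comment above refers to
// (hypothetical; collectors is caller-owned):
//
//	for _, c := range collectors {
//		if err := c.Flush(); err != nil { // spill each buffer to disk early
//			return err
//		}
//	}
//	// ... later, Load each collector; their buffers no longer hold the data in RAM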

func (c *Collector) Load(db kv.RwTx, toBucket string, loadFunc LoadFunc, args TransformArgs) error {
	if c.autoClean {
		defer c.Close()
	}

	if !c.allFlushed {
		if e := c.flushBuffer(true); e != nil {
			return e
		}
	}

	bucket := toBucket

	var cursor kv.RwCursor
	haveSortingGuarantees := isIdentityLoadFunc(loadFunc) // user-defined loadFunc may change ordering
	var lastKey []byte
	if bucket != "" { // passing an empty bucket name is a valid case for etl when no DB modification is expected
		var err error
		cursor, err = db.RwCursor(bucket)
		if err != nil {
			return err
		}
		var errLast error
		lastKey, _, errLast = cursor.Last()
		if errLast != nil {
			return errLast
		}
	}

	var canUseAppend bool
	isDupSort := kv.ChaindataTablesCfg[bucket].Flags&kv.DupSort != 0 && !kv.ChaindataTablesCfg[bucket].AutoDupSortKeysConversion

	logEvery := time.NewTicker(30 * time.Second)
	defer logEvery.Stop()

	i := 0
	var prevK []byte
	loadNextFunc := func(_, k, v []byte) error {
		if i == 0 {
			isEndOfBucket := lastKey == nil || bytes.Compare(lastKey, k) == -1
			canUseAppend = haveSortingGuarantees && isEndOfBucket
		}
		i++

		// SortableOldestAppearedBuffer must guarantee that only the oldest value of a key will appear,
		// but because the buffer size is limited, each flushed file guarantees the "oldest appeared"
		// property only within itself, and files may overlap. Files are sorted, so just skip repeated keys here.
		if c.bufType == SortableOldestAppearedBuffer {
			if bytes.Equal(prevK, k) {
				return nil
			} else {
				// Need to copy k because the underlying space will be re-used for the next key
				prevK = common.Copy(k)
			}
		}

		select {
		default:
		case <-logEvery.C:
			logArgs := []interface{}{"into", bucket}
			if args.LogDetailsLoad != nil {
				logArgs = append(logArgs, args.LogDetailsLoad(k, v)...)
			} else {
				logArgs = append(logArgs, "current_prefix", makeCurrentKeyStr(k))
			}

			c.logger.Log(c.logLvl, fmt.Sprintf("[%s] ETL [2/2] Loading", c.logPrefix), logArgs...)
		}

		isNil := (c.bufType == SortableSliceBuffer && v == nil) ||
			(c.bufType == SortableAppendBuffer && len(v) == 0) || // backward compatibility
			(c.bufType == SortableOldestAppearedBuffer && len(v) == 0)
		if isNil {
			if canUseAppend {
				return nil // nothing to delete after the end of the bucket
			}
			if err := cursor.Delete(k); err != nil {
				return err
			}
			return nil
		}
		if canUseAppend {
			if isDupSort {
				if err := cursor.(kv.RwCursorDupSort).AppendDup(k, v); err != nil {
					return fmt.Errorf("%s: bucket: %s, appendDup: k=%x, %w", c.logPrefix, bucket, k, err)
				}
			} else {
				if err := cursor.Append(k, v); err != nil {
					return fmt.Errorf("%s: bucket: %s, append: k=%x, v=%x, %w", c.logPrefix, bucket, k, v, err)
				}
			}

			return nil
		}
		if err := cursor.Put(k, v); err != nil {
			return fmt.Errorf("%s: put: k=%x, %w", c.logPrefix, k, err)
		}
		return nil
	}

	currentTable := &currentTableReader{db, bucket}
	simpleLoad := func(k, v []byte) error {
		return loadFunc(k, v, currentTable, loadNextFunc)
	}
	if err := mergeSortFiles(c.logPrefix, c.dataProviders, simpleLoad, args); err != nil {
		return fmt.Errorf("loadIntoTable %s: %w", toBucket, err)
	}
	//logger.Trace(fmt.Sprintf("[%s] ETL Load done", c.logPrefix), "bucket", bucket, "records", i)
	return nil
}
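
// Load treats an empty (or, for SortableSliceBuffer, nil) value as a deletion
// marker. A hypothetical sketch of pruning keys through a collector:
//
//	c := NewCollector("[prune]", os.TempDir(), NewSortableBuffer(BufferOptimalSize), log.New())
//	defer c.Close()
//	for _, k := range keysToDelete { // keysToDelete is caller-owned
//		if err := c.Collect(k, nil); err != nil { // nil value => cursor.Delete(k) during Load
//			return err
//		}
//	}
//	err := c.Load(tx, kv.HeaderNumber, IdentityLoadFunc, TransformArgs{})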

func (c *Collector) reset() {
	if c.dataProviders != nil {
		for _, p := range c.dataProviders {
			p.Dispose()
		}
		c.dataProviders = nil
	}
	if c.buf != nil { // collectors created from leftover files have no buffer
		c.buf.Reset()
	}
	c.allFlushed = false
}

func (c *Collector) Close() {
	c.reset()
}

// mergeSortFiles uses a merge sort to order the elements stored within the slice of providers;
// regardless of the ordering within individual files, the elements will be processed in order.
// The first pass reads the first element from each of the providers and populates a heap with the key/value/provider index.
// Later, the heap is popped to get the first element, the record is processed using the LoadFunc, and the provider is asked
// for the next item, which is then added back to the heap.
// The subsequent iterations pop the heap again and load up the provider associated with it to get the next element after processing LoadFunc.
// This continues until all providers have reached their EOF.
func mergeSortFiles(logPrefix string, providers []dataProvider, loadFunc simpleLoadFunc, args TransformArgs) error {
	for _, provider := range providers {
		if err := provider.Wait(); err != nil {
			return err
		}
	}

	h := &Heap{}
	heapInit(h)
	for i, provider := range providers {
		if key, value, err := provider.Next(nil, nil); err == nil {
			heapPush(h, &HeapElem{key, value, i})
		} else /* we must have at least one entry per file */ {
			eee := fmt.Errorf("%s: error reading first readers: n=%d current=%d provider=%s err=%w",
				logPrefix, len(providers), i, provider, err)
			panic(eee)
		}
	}

	// Main loading loop
	for h.Len() > 0 {
		if err := common.Stopped(args.Quit); err != nil {
			return err
		}

		element := heapPop(h)
		provider := providers[element.TimeIdx]
		err := loadFunc(element.Key, element.Value)
		if err != nil {
			return err
		}
		if element.Key, element.Value, err = provider.Next(element.Key[:0], element.Value[:0]); err == nil {
			heapPush(h, element)
		} else if !errors.Is(err, io.EOF) {
			return fmt.Errorf("%s: error while reading next element from disk: %w", logPrefix, err)
		}
	}
	return nil
}
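
// A worked illustration of the merge (hypothetical data): given three sorted
// spill files
//
//	file0: a, d
//	file1: b
//	file2: c, e
//
// the heap holds at most one pending element per provider, so after the first
// pass it contains {a, b, c}. Popping yields a; file0 is then advanced and d
// is pushed, giving {b, c, d}; and so on, so loadFunc observes a, b, c, d, e
// in order even though no single file contains the whole sequence.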

func makeCurrentKeyStr(k []byte) string {
	var currentKeyStr string
	if k == nil {
		currentKeyStr = "final"
	} else if len(k) < 4 {
		currentKeyStr = hex.EncodeToString(k)
	} else if k[0] == 0 && k[1] == 0 && k[2] == 0 && k[3] == 0 && len(k) >= 8 { // if key has leading zeroes, show a bit more info
		currentKeyStr = hex.EncodeToString(k)
	} else {
		currentKeyStr = hex.EncodeToString(k[:4])
	}
	return currentKeyStr
}
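
// For example (derived from the branches above):
//
//	makeCurrentKeyStr(nil)                                  // "final"
//	makeCurrentKeyStr([]byte{0xde, 0xad})                   // "dead"
//	makeCurrentKeyStr([]byte{0xde, 0xad, 0xbe, 0xef, 0x01}) // "deadbeef" (first 4 bytes)
//	makeCurrentKeyStr([]byte{0, 0, 0, 0, 1, 2, 3, 4})       // "0000000001020304" (whole key)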