github.com/ledgerwatch/erigon-lib@v1.0.0/etl/etl.go (about)

     1  /*
     2     Copyright 2021 Erigon contributors
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package etl
    18  
    19  import (
    20  	"bytes"
    21  	"fmt"
    22  	"reflect"
    23  	"time"
    24  
    25  	"github.com/c2h5oh/datasize"
    26  	"github.com/ledgerwatch/erigon-lib/common"
    27  	"github.com/ledgerwatch/erigon-lib/kv"
    28  	"github.com/ledgerwatch/log/v3"
    29  )
    30  
    31  type CurrentTableReader interface {
    32  	Get([]byte) ([]byte, error)
    33  }
    34  
    35  type ExtractNextFunc func(originalK, k []byte, v []byte) error
    36  type ExtractFunc func(k []byte, v []byte, next ExtractNextFunc) error
    37  
    38  // NextKey generates the possible next key w/o changing the key length.
    39  // for [0x01, 0x01, 0x01] it will generate [0x01, 0x01, 0x02], etc
    40  func NextKey(key []byte) ([]byte, error) {
    41  	if len(key) == 0 {
    42  		return key, fmt.Errorf("could not apply NextKey for the empty key")
    43  	}
    44  	nextKey := common.Copy(key)
    45  	for i := len(key) - 1; i >= 0; i-- {
    46  		b := nextKey[i]
    47  		if b < 0xFF {
    48  			nextKey[i] = b + 1
    49  			return nextKey, nil
    50  		}
    51  		if b == 0xFF {
    52  			nextKey[i] = 0
    53  		}
    54  	}
    55  	return key, fmt.Errorf("overflow while applying NextKey")
    56  }
    57  
    58  // LoadCommitHandler is a callback called each time a new batch is being
    59  // loaded from files into a DB
    60  // * `key`: last commited key to the database (use etl.NextKey helper to use in LoadStartKey)
    61  // * `isDone`: true, if everything is processed
    62  type LoadCommitHandler func(db kv.Putter, key []byte, isDone bool) error
    63  type AdditionalLogArguments func(k, v []byte) (additionalLogArguments []interface{})
    64  
    65  type TransformArgs struct {
    66  	Quit              <-chan struct{}
    67  	LogDetailsExtract AdditionalLogArguments
    68  	LogDetailsLoad    AdditionalLogArguments
    69  	// [ExtractStartKey, ExtractEndKey)
    70  	ExtractStartKey []byte
    71  	ExtractEndKey   []byte
    72  	BufferType      int
    73  	BufferSize      int
    74  }
    75  
    76  func Transform(
    77  	logPrefix string,
    78  	db kv.RwTx,
    79  	fromBucket string,
    80  	toBucket string,
    81  	tmpdir string,
    82  	extractFunc ExtractFunc,
    83  	loadFunc LoadFunc,
    84  	args TransformArgs,
    85  	logger log.Logger,
    86  ) error {
    87  	bufferSize := BufferOptimalSize
    88  	if args.BufferSize > 0 {
    89  		bufferSize = datasize.ByteSize(args.BufferSize)
    90  	}
    91  	buffer := getBufferByType(args.BufferType, bufferSize)
    92  	collector := NewCollector(logPrefix, tmpdir, buffer, logger)
    93  	defer collector.Close()
    94  
    95  	t := time.Now()
    96  	if err := extractBucketIntoFiles(logPrefix, db, fromBucket, args.ExtractStartKey, args.ExtractEndKey, collector, extractFunc, args.Quit, args.LogDetailsExtract, logger); err != nil {
    97  		return err
    98  	}
    99  	logger.Trace(fmt.Sprintf("[%s] Extraction finished", logPrefix), "took", time.Since(t))
   100  
   101  	defer func(t time.Time) {
   102  		logger.Trace(fmt.Sprintf("[%s] Load finished", logPrefix), "took", time.Since(t))
   103  	}(time.Now())
   104  	return collector.Load(db, toBucket, loadFunc, args)
   105  }
   106  
   107  // extractBucketIntoFiles - [startkey, endkey)
   108  func extractBucketIntoFiles(
   109  	logPrefix string,
   110  	db kv.Tx,
   111  	bucket string,
   112  	startkey []byte,
   113  	endkey []byte,
   114  	collector *Collector,
   115  	extractFunc ExtractFunc,
   116  	quit <-chan struct{},
   117  	additionalLogArguments AdditionalLogArguments,
   118  	logger log.Logger,
   119  ) error {
   120  	logEvery := time.NewTicker(30 * time.Second)
   121  	defer logEvery.Stop()
   122  
   123  	c, err := db.Cursor(bucket)
   124  	if err != nil {
   125  		return err
   126  	}
   127  	defer c.Close()
   128  	for k, v, e := c.Seek(startkey); k != nil; k, v, e = c.Next() {
   129  		if e != nil {
   130  			return e
   131  		}
   132  		if err := common.Stopped(quit); err != nil {
   133  			return err
   134  		}
   135  		select {
   136  		default:
   137  		case <-logEvery.C:
   138  			logArs := []interface{}{"from", bucket}
   139  			if additionalLogArguments != nil {
   140  				logArs = append(logArs, additionalLogArguments(k, v)...)
   141  			} else {
   142  				logArs = append(logArs, "current_prefix", makeCurrentKeyStr(k))
   143  			}
   144  
   145  			logger.Info(fmt.Sprintf("[%s] ETL [1/2] Extracting", logPrefix), logArs...)
   146  		}
   147  		if endkey != nil && bytes.Compare(k, endkey) >= 0 {
   148  			// endKey is exclusive bound: [startkey, endkey)
   149  			return nil
   150  		}
   151  		if err := extractFunc(k, v, collector.extractNextFunc); err != nil {
   152  			return err
   153  		}
   154  	}
   155  	return collector.flushBuffer(true)
   156  }
   157  
   158  type currentTableReader struct {
   159  	getter kv.Tx
   160  	bucket string
   161  }
   162  
   163  func (s *currentTableReader) Get(key []byte) ([]byte, error) {
   164  	return s.getter.GetOne(s.bucket, key)
   165  }
   166  
   167  // IdentityLoadFunc loads entries as they are, without transformation
   168  var IdentityLoadFunc LoadFunc = func(k []byte, value []byte, _ CurrentTableReader, next LoadNextFunc) error {
   169  	return next(k, k, value)
   170  }
   171  
   172  func isIdentityLoadFunc(f LoadFunc) bool {
   173  	return f == nil || reflect.ValueOf(IdentityLoadFunc).Pointer() == reflect.ValueOf(f).Pointer()
   174  }