github.com/ledgerwatch/erigon-lib@v1.0.0/etl/etl.go (about) 1 /* 2 Copyright 2021 Erigon contributors 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package etl 18 19 import ( 20 "bytes" 21 "fmt" 22 "reflect" 23 "time" 24 25 "github.com/c2h5oh/datasize" 26 "github.com/ledgerwatch/erigon-lib/common" 27 "github.com/ledgerwatch/erigon-lib/kv" 28 "github.com/ledgerwatch/log/v3" 29 ) 30 31 type CurrentTableReader interface { 32 Get([]byte) ([]byte, error) 33 } 34 35 type ExtractNextFunc func(originalK, k []byte, v []byte) error 36 type ExtractFunc func(k []byte, v []byte, next ExtractNextFunc) error 37 38 // NextKey generates the possible next key w/o changing the key length. 39 // for [0x01, 0x01, 0x01] it will generate [0x01, 0x01, 0x02], etc 40 func NextKey(key []byte) ([]byte, error) { 41 if len(key) == 0 { 42 return key, fmt.Errorf("could not apply NextKey for the empty key") 43 } 44 nextKey := common.Copy(key) 45 for i := len(key) - 1; i >= 0; i-- { 46 b := nextKey[i] 47 if b < 0xFF { 48 nextKey[i] = b + 1 49 return nextKey, nil 50 } 51 if b == 0xFF { 52 nextKey[i] = 0 53 } 54 } 55 return key, fmt.Errorf("overflow while applying NextKey") 56 } 57 58 // LoadCommitHandler is a callback called each time a new batch is being 59 // loaded from files into a DB 60 // * `key`: last commited key to the database (use etl.NextKey helper to use in LoadStartKey) 61 // * `isDone`: true, if everything is processed 62 type LoadCommitHandler func(db kv.Putter, key []byte, isDone bool) error 63 type AdditionalLogArguments func(k, v []byte) (additionalLogArguments []interface{}) 64 65 type TransformArgs struct { 66 Quit <-chan struct{} 67 LogDetailsExtract AdditionalLogArguments 68 LogDetailsLoad AdditionalLogArguments 69 // [ExtractStartKey, ExtractEndKey) 70 ExtractStartKey []byte 71 ExtractEndKey []byte 72 BufferType int 73 BufferSize int 74 } 75 76 func Transform( 77 logPrefix string, 78 db kv.RwTx, 79 fromBucket string, 80 toBucket string, 81 tmpdir string, 82 extractFunc ExtractFunc, 83 loadFunc LoadFunc, 84 args TransformArgs, 85 logger log.Logger, 86 ) error { 87 bufferSize := BufferOptimalSize 88 if args.BufferSize > 0 { 89 bufferSize = datasize.ByteSize(args.BufferSize) 90 } 91 buffer := getBufferByType(args.BufferType, bufferSize) 92 collector := NewCollector(logPrefix, tmpdir, buffer, logger) 93 defer collector.Close() 94 95 t := time.Now() 96 if err := extractBucketIntoFiles(logPrefix, db, fromBucket, args.ExtractStartKey, args.ExtractEndKey, collector, extractFunc, args.Quit, args.LogDetailsExtract, logger); err != nil { 97 return err 98 } 99 logger.Trace(fmt.Sprintf("[%s] Extraction finished", logPrefix), "took", time.Since(t)) 100 101 defer func(t time.Time) { 102 logger.Trace(fmt.Sprintf("[%s] Load finished", logPrefix), "took", time.Since(t)) 103 }(time.Now()) 104 return collector.Load(db, toBucket, loadFunc, args) 105 } 106 107 // extractBucketIntoFiles - [startkey, endkey) 108 func extractBucketIntoFiles( 109 logPrefix string, 110 db kv.Tx, 111 bucket string, 112 startkey []byte, 113 endkey []byte, 114 collector *Collector, 115 extractFunc ExtractFunc, 116 quit <-chan struct{}, 117 additionalLogArguments AdditionalLogArguments, 118 logger log.Logger, 119 ) error { 120 logEvery := time.NewTicker(30 * time.Second) 121 defer logEvery.Stop() 122 123 c, err := db.Cursor(bucket) 124 if err != nil { 125 return err 126 } 127 defer c.Close() 128 for k, v, e := c.Seek(startkey); k != nil; k, v, e = c.Next() { 129 if e != nil { 130 return e 131 } 132 if err := common.Stopped(quit); err != nil { 133 return err 134 } 135 select { 136 default: 137 case <-logEvery.C: 138 logArs := []interface{}{"from", bucket} 139 if additionalLogArguments != nil { 140 logArs = append(logArs, additionalLogArguments(k, v)...) 141 } else { 142 logArs = append(logArs, "current_prefix", makeCurrentKeyStr(k)) 143 } 144 145 logger.Info(fmt.Sprintf("[%s] ETL [1/2] Extracting", logPrefix), logArs...) 146 } 147 if endkey != nil && bytes.Compare(k, endkey) >= 0 { 148 // endKey is exclusive bound: [startkey, endkey) 149 return nil 150 } 151 if err := extractFunc(k, v, collector.extractNextFunc); err != nil { 152 return err 153 } 154 } 155 return collector.flushBuffer(true) 156 } 157 158 type currentTableReader struct { 159 getter kv.Tx 160 bucket string 161 } 162 163 func (s *currentTableReader) Get(key []byte) ([]byte, error) { 164 return s.getter.GetOne(s.bucket, key) 165 } 166 167 // IdentityLoadFunc loads entries as they are, without transformation 168 var IdentityLoadFunc LoadFunc = func(k []byte, value []byte, _ CurrentTableReader, next LoadNextFunc) error { 169 return next(k, k, value) 170 } 171 172 func isIdentityLoadFunc(f LoadFunc) bool { 173 return f == nil || reflect.ValueOf(IdentityLoadFunc).Pointer() == reflect.ValueOf(f).Pointer() 174 }