github.com/ledgerwatch/erigon-lib@v1.0.0/etl/collector.go

/*
   Copyright 2021 Erigon contributors

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package etl

import (
	"bytes"
	"encoding/hex"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"time"

	"github.com/c2h5oh/datasize"
	"github.com/ledgerwatch/log/v3"

	"github.com/ledgerwatch/erigon-lib/common"
	"github.com/ledgerwatch/erigon-lib/kv"
)

type LoadNextFunc func(originalK, k, v []byte) error
type LoadFunc func(k, v []byte, table CurrentTableReader, next LoadNextFunc) error
type simpleLoadFunc func(k, v []byte) error

// Collector performs the "T" (Transform) job of ETL, but can also be used without the "E" (Extract) part,
// as a Collect-Transform-Load.
type Collector struct {
	buf           Buffer
	logPrefix     string
	tmpdir        string
	dataProviders []dataProvider
	logLvl        log.Lvl
	bufType       int
	allFlushed    bool
	autoClean     bool
	logger        log.Logger
}

// NewCollectorFromFiles creates a collector from existing files (left over from a previous unsuccessful loading)
func NewCollectorFromFiles(logPrefix, tmpdir string, logger log.Logger) (*Collector, error) {
	if _, err := os.Stat(tmpdir); os.IsNotExist(err) {
		return nil, nil
	}
	dirEntries, err := os.ReadDir(tmpdir)
	if err != nil {
		return nil, fmt.Errorf("collector from files - reading directory %s: %w", tmpdir, err)
	}
	if len(dirEntries) == 0 {
		return nil, nil
	}
	dataProviders := make([]dataProvider, len(dirEntries))
	for i, dirEntry := range dirEntries {
		fileInfo, err := dirEntry.Info()
		if err != nil {
			return nil, fmt.Errorf("collector from files - reading file info %s: %w", dirEntry.Name(), err)
		}
		var dataProvider fileDataProvider
		dataProvider.file, err = os.Open(filepath.Join(tmpdir, fileInfo.Name()))
		if err != nil {
			return nil, fmt.Errorf("collector from files - opening file %s: %w", fileInfo.Name(), err)
		}
		dataProviders[i] = &dataProvider
	}
	// Keep the logger (and a sane default level) so the periodic progress logging in Load works.
	return &Collector{dataProviders: dataProviders, allFlushed: true, autoClean: false, logPrefix: logPrefix, logLvl: log.LvlInfo, logger: logger}, nil
}

// NewCriticalCollector does not clean up temporary files if loading has failed
func NewCriticalCollector(logPrefix, tmpdir string, sortableBuffer Buffer, logger log.Logger) *Collector {
	c := NewCollector(logPrefix, tmpdir, sortableBuffer, logger)
	c.autoClean = false
	return c
}

func NewCollector(logPrefix, tmpdir string, sortableBuffer Buffer, logger log.Logger) *Collector {
	return &Collector{autoClean: true, bufType: getTypeByBuffer(sortableBuffer), buf: sortableBuffer, logPrefix: logPrefix, tmpdir: tmpdir, logLvl: log.LvlInfo, logger: logger}
}

func (c *Collector) extractNextFunc(originalK, k []byte, v []byte) error {
	c.buf.Put(k, v)
	if !c.buf.CheckFlushSize() {
		return nil
	}
	return c.flushBuffer(false)
}

func (c *Collector) Collect(k, v []byte) error {
	return c.extractNextFunc(k, k, v)
}

func (c *Collector) LogLvl(v log.Lvl) { c.logLvl = v }
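// exampleCollectAndLoad is an illustrative sketch, not part of the original file:
// the typical Collect/Load round-trip. The bucket name and the choice of
// NewSortableBuffer(BufferOptimalSize) are arbitrary picks for the example.
func exampleCollectAndLoad(tx kv.RwTx, tmpdir string, logger log.Logger) error {
	c := NewCollector("[example]", tmpdir, NewSortableBuffer(BufferOptimalSize), logger)
	defer c.Close() // with autoClean on, Load cleans up too; Close here covers early-error paths

	if err := c.Collect([]byte("key"), []byte("value")); err != nil {
		return err
	}
	// IdentityLoadFunc writes entries unchanged; because it preserves sorted order,
	// Load can use the cursor Append fast path when writing past the end of the bucket.
	return c.Load(tx, "ExampleBucket", IdentityLoadFunc, TransformArgs{})
}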
func (c *Collector) flushBuffer(canStoreInRam bool) error {
	if c.buf.Len() == 0 {
		return nil
	}

	var provider dataProvider
	if canStoreInRam && len(c.dataProviders) == 0 {
		c.buf.Sort()
		provider = KeepInRAM(c.buf)
		c.allFlushed = true
	} else {
		fullBuf := c.buf
		prevLen, prevSize := fullBuf.Len(), fullBuf.SizeLimit()
		c.buf = getBufferByType(c.bufType, datasize.ByteSize(c.buf.SizeLimit()))

		doFsync := !c.autoClean /* is critical collector */
		var err error
		provider, err = FlushToDisk(c.logPrefix, fullBuf, c.tmpdir, doFsync, c.logLvl)
		if err != nil {
			return err
		}
		c.buf.Prealloc(prevLen/8, prevSize/8)
	}
	if provider != nil {
		c.dataProviders = append(c.dataProviders, provider)
	}
	return nil
}

// Flush is an optional method (usually the user doesn't need to call it): it forces a sort+flush of the current buffer.
// It triggers a background sort and flush, reducing the RAM held by the collector.
// It's useful when working with many collectors: call it on each of them to trigger background sorting for all.
func (c *Collector) Flush() error {
	if !c.allFlushed {
		if e := c.flushBuffer(false); e != nil {
			return e
		}
	}
	return nil
}

func (c *Collector) Load(db kv.RwTx, toBucket string, loadFunc LoadFunc, args TransformArgs) error {
	if c.autoClean {
		defer c.Close()
	}

	if !c.allFlushed {
		if e := c.flushBuffer(true); e != nil {
			return e
		}
	}

	bucket := toBucket

	var cursor kv.RwCursor
	haveSortingGuaranties := isIdentityLoadFunc(loadFunc) // a user-defined loadFunc may change the ordering
	var lastKey []byte
	if bucket != "" { // passing an empty bucket name is a valid case for ETL when no DB modification is expected
		var err error
		cursor, err = db.RwCursor(bucket)
		if err != nil {
			return err
		}
		var errLast error
		lastKey, _, errLast = cursor.Last()
		if errLast != nil {
			return errLast
		}
	}

	var canUseAppend bool
	isDupSort := kv.ChaindataTablesCfg[bucket].Flags&kv.DupSort != 0 && !kv.ChaindataTablesCfg[bucket].AutoDupSortKeysConversion

	logEvery := time.NewTicker(30 * time.Second)
	defer logEvery.Stop()

	i := 0
	var prevK []byte
	loadNextFunc := func(_, k, v []byte) error {
		if i == 0 {
			isEndOfBucket := lastKey == nil || bytes.Compare(lastKey, k) == -1
			canUseAppend = haveSortingGuaranties && isEndOfBucket
		}
		i++

		// SortableOldestAppearedBuffer must guarantee that only the oldest value of a key appears,
		// but because the buffer size is limited, each flushed file guarantees the "oldest appeared"
		// property only within itself, and files may overlap. Files are sorted, so just skip repeated keys here.
		if c.bufType == SortableOldestAppearedBuffer {
			if bytes.Equal(prevK, k) {
				return nil
			}
			// Need to copy k because the underlying space will be re-used for the next key
			prevK = common.Copy(k)
		}

		select {
		default:
		case <-logEvery.C:
			logArgs := []interface{}{"into", bucket}
			if args.LogDetailsLoad != nil {
				logArgs = append(logArgs, args.LogDetailsLoad(k, v)...)
			} else {
				logArgs = append(logArgs, "current_prefix", makeCurrentKeyStr(k))
			}

			c.logger.Log(c.logLvl, fmt.Sprintf("[%s] ETL [2/2] Loading", c.logPrefix), logArgs...)
		}

		isNil := (c.bufType == SortableSliceBuffer && v == nil) ||
			(c.bufType == SortableAppendBuffer && len(v) == 0) || // backward compatibility
			(c.bufType == SortableOldestAppearedBuffer && len(v) == 0)
		if isNil {
			if canUseAppend {
				return nil // nothing to delete after the end of the bucket
			}
			if err := cursor.Delete(k); err != nil {
				return err
			}
			return nil
		}
		if canUseAppend {
			if isDupSort {
				if err := cursor.(kv.RwCursorDupSort).AppendDup(k, v); err != nil {
					return fmt.Errorf("%s: bucket: %s, appendDup: k=%x, %w", c.logPrefix, bucket, k, err)
				}
			} else {
				if err := cursor.Append(k, v); err != nil {
					return fmt.Errorf("%s: bucket: %s, append: k=%x, v=%x, %w", c.logPrefix, bucket, k, v, err)
				}
			}

			return nil
		}
		if err := cursor.Put(k, v); err != nil {
			return fmt.Errorf("%s: put: k=%x, %w", c.logPrefix, k, err)
		}
		return nil
	}

	currentTable := &currentTableReader{db, bucket}
	simpleLoad := func(k, v []byte) error {
		return loadFunc(k, v, currentTable, loadNextFunc)
	}
	if err := mergeSortFiles(c.logPrefix, c.dataProviders, simpleLoad, args); err != nil {
		return fmt.Errorf("loadIntoTable %s: %w", toBucket, err)
	}
	// c.logger.Trace(fmt.Sprintf("[%s] ETL Load done", c.logPrefix), "bucket", bucket, "records", i)
	return nil
}
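// exampleTransformLoadFunc is an illustrative sketch, not part of the original file:
// a user-defined LoadFunc that rewrites each value before handing it to `next`.
// Because it is not the identity function, Load above disables the cursor-Append
// fast path and falls back to Put, since sorting guarantees no longer hold in general.
// The "v1:" prefix is a hypothetical transformation chosen for the example.
func exampleTransformLoadFunc(k, v []byte, _ CurrentTableReader, next LoadNextFunc) error {
	// The first argument (originalK) is ignored by the loadNextFunc defined in Load above.
	return next(k, k, append([]byte("v1:"), v...))
}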
func (c *Collector) reset() {
	if c.dataProviders != nil {
		for _, p := range c.dataProviders {
			p.Dispose()
		}
		c.dataProviders = nil
	}
	if c.buf != nil { // collectors created by NewCollectorFromFiles have no buffer
		c.buf.Reset()
	}
	c.allFlushed = false
}

func (c *Collector) Close() {
	c.reset()
}
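// exampleResumeFromFiles is an illustrative sketch, not part of the original file,
// of the crash-recovery flow: a collector from NewCriticalCollector leaves its temp
// files behind when loading fails, and a later run can resume from them via
// NewCollectorFromFiles. The tmpdir and bucket name are hypothetical.
func exampleResumeFromFiles(tx kv.RwTx, tmpdir string, logger log.Logger) error {
	c, err := NewCollectorFromFiles("[resume]", tmpdir, logger)
	if err != nil {
		return err
	}
	if c == nil { // no leftover files: nothing to resume
		return nil
	}
	defer c.Close() // autoClean is off for resumed collectors, so dispose providers explicitly
	return c.Load(tx, "ExampleBucket", IdentityLoadFunc, TransformArgs{})
}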
// mergeSortFiles uses merge-sort to order the elements stored within the slice of providers,
// so that regardless of the ordering within individual files, the elements are processed in globally sorted order.
// The first pass reads the first element from each of the providers and populates a heap with the key/value/provider index.
// Later, the heap is popped to get the first element, the record is processed using the LoadFunc, and the provider is asked
// for the next item, which is then added back to the heap.
// The subsequent iterations pop the heap again, process the record, and load up the associated provider to get the next element.
// This continues until all providers have reached their EOF.
func mergeSortFiles(logPrefix string, providers []dataProvider, loadFunc simpleLoadFunc, args TransformArgs) error {
	for _, provider := range providers {
		if err := provider.Wait(); err != nil {
			return err
		}
	}

	h := &Heap{}
	heapInit(h)
	for i, provider := range providers {
		if key, value, err := provider.Next(nil, nil); err == nil {
			heapPush(h, &HeapElem{key, value, i})
		} else /* we must have at least one entry per file */ {
			eee := fmt.Errorf("%s: error reading first entry: n=%d current=%d provider=%s err=%w",
				logPrefix, len(providers), i, provider, err)
			panic(eee)
		}
	}

	// Main loading loop
	for h.Len() > 0 {
		if err := common.Stopped(args.Quit); err != nil {
			return err
		}

		element := heapPop(h)
		provider := providers[element.TimeIdx]
		err := loadFunc(element.Key, element.Value)
		if err != nil {
			return err
		}
		if element.Key, element.Value, err = provider.Next(element.Key[:0], element.Value[:0]); err == nil {
			heapPush(h, element)
		} else if !errors.Is(err, io.EOF) {
			return fmt.Errorf("%s: error while reading next element from disk: %w", logPrefix, err)
		}
	}
	return nil
}

func makeCurrentKeyStr(k []byte) string {
	var currentKeyStr string
	if k == nil {
		currentKeyStr = "final"
	} else if len(k) < 4 {
		currentKeyStr = hex.EncodeToString(k)
	} else if len(k) >= 8 && k[0] == 0 && k[1] == 0 && k[2] == 0 && k[3] == 0 { // if the key has leading zeroes, show a bit more info
		currentKeyStr = hex.EncodeToString(k)
	} else {
		currentKeyStr = hex.EncodeToString(k[:4])
	}
	return currentKeyStr
}
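// exampleHeapMerge is an illustrative sketch, not part of the original file, of the
// invariant mergeSortFiles relies on: a min-heap seeded with the head element of each
// sorted source yields keys in globally sorted order when popped repeatedly, even
// though the sources may overlap. Here each source is an in-memory slice of
// key/value pairs instead of a dataProvider, so the EOF handling becomes a bounds check.
func exampleHeapMerge(sources [][][2][]byte, emit func(k, v []byte)) {
	h := &Heap{}
	heapInit(h)
	next := make([]int, len(sources)) // index of the next unread element per source
	for i := range sources {
		if len(sources[i]) > 0 {
			heapPush(h, &HeapElem{sources[i][0][0], sources[i][0][1], i})
			next[i] = 1
		}
	}
	for h.Len() > 0 {
		e := heapPop(h)
		emit(e.Key, e.Value)
		// Refill the heap from the source that produced the popped element.
		if src := sources[e.TimeIdx]; next[e.TimeIdx] < len(src) {
			pair := src[next[e.TimeIdx]]
			next[e.TimeIdx]++
			heapPush(h, &HeapElem{pair[0], pair[1], e.TimeIdx})
		}
	}
}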