github.com/ledgerwatch/erigon-lib@v1.0.0/etl/dataprovider.go (about)

     1  /*
     2     Copyright 2021 Erigon contributors
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package etl
    18  
    19  import (
    20  	"bufio"
    21  	"encoding/binary"
    22  	"fmt"
    23  	"io"
    24  	"os"
    25  
    26  	"github.com/ledgerwatch/log/v3"
    27  	"golang.org/x/sync/errgroup"
    28  )
    29  
    30  type dataProvider interface {
    31  	Next(keyBuf, valBuf []byte) ([]byte, []byte, error)
    32  	Dispose()    // Safe for repeated call, doesn't return error - means defer-friendly
    33  	Wait() error // join point for async providers
    34  }
    35  
    36  type fileDataProvider struct {
    37  	file       *os.File
    38  	reader     io.Reader
    39  	byteReader io.ByteReader // Different interface to the same object as reader
    40  	wg         *errgroup.Group
    41  }
    42  
    43  // FlushToDisk - `doFsync` is true only for 'critical' collectors (which should not loose).
    44  func FlushToDisk(logPrefix string, b Buffer, tmpdir string, doFsync bool, lvl log.Lvl) (dataProvider, error) {
    45  	if b.Len() == 0 {
    46  		return nil, nil
    47  	}
    48  
    49  	provider := &fileDataProvider{reader: nil, wg: &errgroup.Group{}}
    50  	provider.wg.Go(func() error {
    51  		b.Sort()
    52  
    53  		// if we are going to create files in the system temp dir, we don't need any
    54  		// subfolders.
    55  		if tmpdir != "" {
    56  			if err := os.MkdirAll(tmpdir, 0755); err != nil {
    57  				return err
    58  			}
    59  		}
    60  
    61  		bufferFile, err := os.CreateTemp(tmpdir, "erigon-sortable-buf-")
    62  		if err != nil {
    63  			return err
    64  		}
    65  		provider.file = bufferFile
    66  
    67  		if doFsync {
    68  			defer bufferFile.Sync() //nolint:errcheck
    69  		}
    70  
    71  		w := bufio.NewWriterSize(bufferFile, BufIOSize)
    72  		defer w.Flush() //nolint:errcheck
    73  
    74  		if err = b.Write(w); err != nil {
    75  			return fmt.Errorf("error writing entries to disk: %w", err)
    76  		}
    77  		log.Log(lvl, fmt.Sprintf("[%s] Flushed buffer file", logPrefix), "name", bufferFile.Name())
    78  		return nil
    79  	})
    80  
    81  	return provider, nil
    82  }
    83  
    84  func (p *fileDataProvider) Next(keyBuf, valBuf []byte) ([]byte, []byte, error) {
    85  	if p.reader == nil {
    86  		_, err := p.file.Seek(0, 0)
    87  		if err != nil {
    88  			return nil, nil, err
    89  		}
    90  		r := bufio.NewReaderSize(p.file, BufIOSize)
    91  		p.reader = r
    92  		p.byteReader = r
    93  
    94  	}
    95  	return readElementFromDisk(p.reader, p.byteReader, keyBuf, valBuf)
    96  }
    97  
    98  func (p *fileDataProvider) Wait() error { return p.wg.Wait() }
    99  func (p *fileDataProvider) Dispose() {
   100  	if p.file != nil { //invariant: safe to call multiple time
   101  		p.Wait()
   102  		_ = p.file.Close()
   103  		_ = os.Remove(p.file.Name())
   104  		p.file = nil
   105  	}
   106  }
   107  
   108  func (p *fileDataProvider) String() string {
   109  	return fmt.Sprintf("%T(file: %s)", p, p.file.Name())
   110  }
   111  
   112  func readElementFromDisk(r io.Reader, br io.ByteReader, keyBuf, valBuf []byte) ([]byte, []byte, error) {
   113  	n, err := binary.ReadVarint(br)
   114  	if err != nil {
   115  		return nil, nil, err
   116  	}
   117  	if n >= 0 {
   118  		// Reallocate the slice or extend it if there is enough capacity
   119  		if keyBuf == nil || len(keyBuf)+int(n) > cap(keyBuf) {
   120  			newKeyBuf := make([]byte, len(keyBuf)+int(n))
   121  			copy(newKeyBuf, keyBuf)
   122  			keyBuf = newKeyBuf
   123  		} else {
   124  			keyBuf = keyBuf[:len(keyBuf)+int(n)]
   125  		}
   126  		if _, err = io.ReadFull(r, keyBuf[len(keyBuf)-int(n):]); err != nil {
   127  			return nil, nil, err
   128  		}
   129  	} else {
   130  		keyBuf = nil
   131  	}
   132  	if n, err = binary.ReadVarint(br); err != nil {
   133  		return nil, nil, err
   134  	}
   135  	if n >= 0 {
   136  		// Reallocate the slice or extend it if there is enough capacity
   137  		if valBuf == nil || len(valBuf)+int(n) > cap(valBuf) {
   138  			newValBuf := make([]byte, len(valBuf)+int(n))
   139  			copy(newValBuf, valBuf)
   140  			valBuf = newValBuf
   141  		} else {
   142  			valBuf = valBuf[:len(valBuf)+int(n)]
   143  		}
   144  		if _, err = io.ReadFull(r, valBuf[len(valBuf)-int(n):]); err != nil {
   145  			return nil, nil, err
   146  		}
   147  	} else {
   148  		valBuf = nil
   149  	}
   150  	return keyBuf, valBuf, err
   151  }
   152  
   153  type memoryDataProvider struct {
   154  	buffer       Buffer
   155  	currentIndex int
   156  }
   157  
   158  func KeepInRAM(buffer Buffer) dataProvider {
   159  	return &memoryDataProvider{buffer, 0}
   160  }
   161  
   162  func (p *memoryDataProvider) Next(keyBuf, valBuf []byte) ([]byte, []byte, error) {
   163  	if p.currentIndex >= p.buffer.Len() {
   164  		return nil, nil, io.EOF
   165  	}
   166  	key, value := p.buffer.Get(p.currentIndex, keyBuf, valBuf)
   167  	p.currentIndex++
   168  	return key, value, nil
   169  }
   170  
   171  func (p *memoryDataProvider) Wait() error { return nil }
   172  func (p *memoryDataProvider) Dispose()    {}
   173  
   174  func (p *memoryDataProvider) String() string {
   175  	return fmt.Sprintf("%T(buffer.Len: %d)", p, p.buffer.Len())
   176  }