github.com/ledgerwatch/erigon-lib@v1.0.0/etl/buffers.go (about)

     1  /*
     2     Copyright 2021 Erigon contributors
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package etl
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"fmt"
    23  	"io"
    24  	"sort"
    25  	"strconv"
    26  
    27  	"github.com/c2h5oh/datasize"
    28  	"github.com/ledgerwatch/erigon-lib/common"
    29  )
    30  
const (
	// SortableSliceBuffer - simple slice-backed buffer that keeps every key/value pair put into it
	SortableSliceBuffer = iota
	// SortableAppendBuffer - map-backed buffer that concatenates values put under the same key: map[k] [v1 v2 v3]
	SortableAppendBuffer
	// SortableOldestAppearedBuffer - buffer that keeps only the oldest entries:
	// if v1 was added under key K first and v2 afterwards, only v1 will stay
	SortableOldestAppearedBuffer

	// BufIOSize - 128 pages | default is 1 page | increasing over `64 * 4096` doesn't show a speedup on SSD/NVMe, but shows a speedup on cloud drives
	BufIOSize = 128 * 4096
)
    43  
// BufferOptimalSize is initialised to 256 MB. It is a var (not a const) because
// we sometimes want to change it from tests or command-line flags.
var BufferOptimalSize = 256 * datasize.MB
    45  
// Buffer is the common contract of the in-memory key/value buffers defined in
// this file (sortableBuffer, appendSortableBuffer, oldestEntrySortableBuffer).
// For the two map-backed implementations, Get and Write read the slice that
// Sort builds, so Sort has to be called before them.
type Buffer interface {
	// Put adds a key/value pair to the buffer.
	Put(k, v []byte)
	// Get returns the key and value of the i-th entry. The implementations in
	// this file append the bytes to keyBuf and valBuf and return the results.
	Get(i int, keyBuf, valBuf []byte) ([]byte, []byte)
	// Len returns the number of entries currently held.
	Len() int
	// Reset empties the buffer so that it can be filled again.
	Reset()
	// SizeLimit returns the size threshold the buffer was created with.
	SizeLimit() int
	// Prealloc allocates internal storage for the predicted number of keys and amount of data.
	Prealloc(predictKeysAmount, predictDataAmount int)
	// Write serializes the buffered entries to the given writer.
	Write(io.Writer) error
	// Sort orders the entries by key, ascending in byte-wise order.
	Sort()
	// CheckFlushSize reports whether the buffer's size has reached its size limit.
	CheckFlushSize() bool
}
    57  
// sortableBufferEntry is one key/value pair as stored in the sortedBuf slice
// of the map-backed buffers.
type sortableBufferEntry struct {
	key   []byte
	value []byte
}
    62  
    63  var (
    64  	_ Buffer = &sortableBuffer{}
    65  	_ Buffer = &appendSortableBuffer{}
    66  	_ Buffer = &oldestEntrySortableBuffer{}
    67  )
    68  
    69  func NewSortableBuffer(bufferOptimalSize datasize.ByteSize) *sortableBuffer {
    70  	return &sortableBuffer{
    71  		optimalSize: int(bufferOptimalSize.Bytes()),
    72  	}
    73  }
    74  
// sortableBuffer keeps all entries in one contiguous byte slice and sorts them
// by permuting the offset/length bookkeeping instead of moving the bytes.
type sortableBuffer struct {
	// offsets holds two ints per entry: the start of the key and the start of the value inside data.
	offsets []int
	// lens holds two ints per entry: the key length and the value length; -1 marks a nil slice.
	lens []int
	// data is the concatenation of all key and value bytes in insertion order.
	data []byte
	// optimalSize is the threshold, in bytes, that CheckFlushSize compares Size against.
	optimalSize int
}
    81  
    82  // Put adds key and value to the buffer. These slices will not be accessed later,
    83  // so no copying is necessary
    84  func (b *sortableBuffer) Put(k, v []byte) {
    85  	lk, lv := len(k), len(v)
    86  	if k == nil {
    87  		lk = -1
    88  	}
    89  	if v == nil {
    90  		lv = -1
    91  	}
    92  	b.lens = append(b.lens, lk, lv)
    93  
    94  	b.offsets = append(b.offsets, len(b.data))
    95  	b.data = append(b.data, k...)
    96  	b.offsets = append(b.offsets, len(b.data))
    97  	b.data = append(b.data, v...)
    98  }
    99  
   100  func (b *sortableBuffer) Size() int {
   101  	return len(b.data) + 8*len(b.offsets) + 8*len(b.lens)
   102  }
   103  
   104  func (b *sortableBuffer) Len() int {
   105  	return len(b.offsets) / 2
   106  }
   107  
   108  func (b *sortableBuffer) Less(i, j int) bool {
   109  	i2, j2 := i*2, j*2
   110  	ki := b.data[b.offsets[i2] : b.offsets[i2]+b.lens[i2]]
   111  	kj := b.data[b.offsets[j2] : b.offsets[j2]+b.lens[j2]]
   112  	return bytes.Compare(ki, kj) < 0
   113  }
   114  
   115  func (b *sortableBuffer) Swap(i, j int) {
   116  	i2, j2 := i*2, j*2
   117  	b.offsets[i2], b.offsets[j2] = b.offsets[j2], b.offsets[i2]
   118  	b.offsets[i2+1], b.offsets[j2+1] = b.offsets[j2+1], b.offsets[i2+1]
   119  	b.lens[i2], b.lens[j2] = b.lens[j2], b.lens[i2]
   120  	b.lens[i2+1], b.lens[j2+1] = b.lens[j2+1], b.lens[i2+1]
   121  }
   122  
   123  func (b *sortableBuffer) Get(i int, keyBuf, valBuf []byte) ([]byte, []byte) {
   124  	i2 := i * 2
   125  	keyOffset, valOffset := b.offsets[i2], b.offsets[i2+1]
   126  	keyLen, valLen := b.lens[i2], b.lens[i2+1]
   127  	if keyLen > 0 {
   128  		keyBuf = append(keyBuf, b.data[keyOffset:keyOffset+keyLen]...)
   129  	} else if keyLen == 0 {
   130  		if keyBuf != nil {
   131  			keyBuf = keyBuf[:0]
   132  		} else {
   133  			keyBuf = []byte{}
   134  		}
   135  	} else {
   136  		keyBuf = nil
   137  	}
   138  	if valLen > 0 {
   139  		valBuf = append(valBuf, b.data[valOffset:valOffset+valLen]...)
   140  	} else if valLen == 0 {
   141  		if valBuf != nil {
   142  			valBuf = valBuf[:0]
   143  		} else {
   144  			valBuf = []byte{}
   145  		}
   146  	} else {
   147  		valBuf = nil
   148  	}
   149  	return keyBuf, valBuf
   150  }
   151  
   152  func (b *sortableBuffer) Prealloc(predictKeysAmount, predictDataSize int) {
   153  	b.lens = make([]int, 0, predictKeysAmount)
   154  	b.offsets = make([]int, 0, predictKeysAmount)
   155  	b.data = make([]byte, 0, predictDataSize)
   156  }
   157  
   158  func (b *sortableBuffer) Reset() {
   159  	b.offsets = b.offsets[:0]
   160  	b.lens = b.lens[:0]
   161  	b.data = b.data[:0]
   162  }
   163  func (b *sortableBuffer) SizeLimit() int { return b.optimalSize }
   164  func (b *sortableBuffer) Sort() {
   165  	if sort.IsSorted(b) {
   166  		return
   167  	}
   168  	sort.Stable(b)
   169  }
   170  
   171  func (b *sortableBuffer) CheckFlushSize() bool {
   172  	return b.Size() >= b.optimalSize
   173  }
   174  
   175  func (b *sortableBuffer) Write(w io.Writer) error {
   176  	var numBuf [binary.MaxVarintLen64]byte
   177  	for i, offset := range b.offsets {
   178  		l := b.lens[i]
   179  		n := binary.PutVarint(numBuf[:], int64(l))
   180  		if _, err := w.Write(numBuf[:n]); err != nil {
   181  			return err
   182  		}
   183  		if l <= 0 {
   184  			continue
   185  		}
   186  		if _, err := w.Write(b.data[offset : offset+l]); err != nil {
   187  			return err
   188  		}
   189  	}
   190  	return nil
   191  }
   192  
   193  func NewAppendBuffer(bufferOptimalSize datasize.ByteSize) *appendSortableBuffer {
   194  	return &appendSortableBuffer{
   195  		entries:     make(map[string][]byte),
   196  		size:        0,
   197  		optimalSize: int(bufferOptimalSize.Bytes()),
   198  	}
   199  }
   200  
// appendSortableBuffer accumulates values per key: values put under the same
// key are concatenated in the order they arrive.
type appendSortableBuffer struct {
	// entries maps a key to the concatenation of all values put under it.
	entries map[string][]byte
	// sortedBuf is filled from entries by Sort; Get and Write read from it.
	sortedBuf []sortableBufferEntry
	// size is the running total of key and value bytes accounted for by Put.
	size int
	// optimalSize is the threshold, in bytes, that CheckFlushSize compares size against.
	optimalSize int
}
   207  
   208  func (b *appendSortableBuffer) Put(k, v []byte) {
   209  	stored, ok := b.entries[string(k)]
   210  	if !ok {
   211  		b.size += len(k)
   212  	}
   213  	b.size += len(v)
   214  	stored = append(stored, v...)
   215  	b.entries[string(k)] = stored
   216  }
   217  
   218  func (b *appendSortableBuffer) Size() int      { return b.size }
   219  func (b *appendSortableBuffer) SizeLimit() int { return b.optimalSize }
   220  
   221  func (b *appendSortableBuffer) Len() int {
   222  	return len(b.entries)
   223  }
   224  func (b *appendSortableBuffer) Sort() {
   225  	for i := range b.entries {
   226  		b.sortedBuf = append(b.sortedBuf, sortableBufferEntry{key: []byte(i), value: b.entries[i]})
   227  	}
   228  	sort.Stable(b)
   229  }
   230  
   231  func (b *appendSortableBuffer) Less(i, j int) bool {
   232  	return bytes.Compare(b.sortedBuf[i].key, b.sortedBuf[j].key) < 0
   233  }
   234  
   235  func (b *appendSortableBuffer) Swap(i, j int) {
   236  	b.sortedBuf[i], b.sortedBuf[j] = b.sortedBuf[j], b.sortedBuf[i]
   237  }
   238  
   239  func (b *appendSortableBuffer) Get(i int, keyBuf, valBuf []byte) ([]byte, []byte) {
   240  	keyBuf = append(keyBuf, b.sortedBuf[i].key...)
   241  	valBuf = append(valBuf, b.sortedBuf[i].value...)
   242  	return keyBuf, valBuf
   243  }
   244  func (b *appendSortableBuffer) Reset() {
   245  	b.sortedBuf = nil
   246  	b.entries = make(map[string][]byte)
   247  	b.size = 0
   248  }
   249  func (b *appendSortableBuffer) Prealloc(predictKeysAmount, predictDataSize int) {
   250  	b.entries = make(map[string][]byte, predictKeysAmount)
   251  	b.sortedBuf = make([]sortableBufferEntry, 0, predictKeysAmount*2)
   252  }
   253  
   254  func (b *appendSortableBuffer) Write(w io.Writer) error {
   255  	var numBuf [binary.MaxVarintLen64]byte
   256  	entries := b.sortedBuf
   257  	for _, entry := range entries {
   258  		lk := int64(len(entry.key))
   259  		if entry.key == nil {
   260  			lk = -1
   261  		}
   262  		n := binary.PutVarint(numBuf[:], lk)
   263  		if _, err := w.Write(numBuf[:n]); err != nil {
   264  			return err
   265  		}
   266  		if _, err := w.Write(entry.key); err != nil {
   267  			return err
   268  		}
   269  		lv := int64(len(entry.key))
   270  		if entry.value == nil {
   271  			lv = -1
   272  		}
   273  		n = binary.PutVarint(numBuf[:], lv)
   274  		if _, err := w.Write(numBuf[:n]); err != nil {
   275  			return err
   276  		}
   277  		if _, err := w.Write(entry.value); err != nil {
   278  			return err
   279  		}
   280  	}
   281  	return nil
   282  }
   283  
   284  func (b *appendSortableBuffer) CheckFlushSize() bool {
   285  	return b.size >= b.optimalSize
   286  }
   287  
   288  func NewOldestEntryBuffer(bufferOptimalSize datasize.ByteSize) *oldestEntrySortableBuffer {
   289  	return &oldestEntrySortableBuffer{
   290  		entries:     make(map[string][]byte),
   291  		size:        0,
   292  		optimalSize: int(bufferOptimalSize.Bytes()),
   293  	}
   294  }
   295  
// oldestEntrySortableBuffer keeps, for every key, only the first value that
// was put under it; later values for the same key are ignored.
type oldestEntrySortableBuffer struct {
	// entries maps a key to a copy of the first value put under it.
	entries map[string][]byte
	// sortedBuf is filled from entries by Sort; Get and Write read from it.
	sortedBuf []sortableBufferEntry
	// size is the running total accounted for by Put (twice the key length plus the value length per stored entry).
	size int
	// optimalSize is the threshold, in bytes, that CheckFlushSize compares size against.
	optimalSize int
}
   302  
   303  func (b *oldestEntrySortableBuffer) Put(k, v []byte) {
   304  	_, ok := b.entries[string(k)]
   305  	if ok {
   306  		// if we already had this entry, we are going to keep it and ignore new value
   307  		return
   308  	}
   309  
   310  	b.size += len(k)*2 + len(v)
   311  	b.entries[string(k)] = common.Copy(v)
   312  }
   313  
   314  func (b *oldestEntrySortableBuffer) Size() int      { return b.size }
   315  func (b *oldestEntrySortableBuffer) SizeLimit() int { return b.optimalSize }
   316  
   317  func (b *oldestEntrySortableBuffer) Len() int {
   318  	return len(b.entries)
   319  }
   320  
   321  func (b *oldestEntrySortableBuffer) Sort() {
   322  	for k, v := range b.entries {
   323  		b.sortedBuf = append(b.sortedBuf, sortableBufferEntry{key: []byte(k), value: v})
   324  	}
   325  	sort.Stable(b)
   326  }
   327  
   328  func (b *oldestEntrySortableBuffer) Less(i, j int) bool {
   329  	return bytes.Compare(b.sortedBuf[i].key, b.sortedBuf[j].key) < 0
   330  }
   331  
   332  func (b *oldestEntrySortableBuffer) Swap(i, j int) {
   333  	b.sortedBuf[i], b.sortedBuf[j] = b.sortedBuf[j], b.sortedBuf[i]
   334  }
   335  
   336  func (b *oldestEntrySortableBuffer) Get(i int, keyBuf, valBuf []byte) ([]byte, []byte) {
   337  	keyBuf = append(keyBuf, b.sortedBuf[i].key...)
   338  	valBuf = append(valBuf, b.sortedBuf[i].value...)
   339  	return keyBuf, valBuf
   340  }
   341  func (b *oldestEntrySortableBuffer) Reset() {
   342  	b.sortedBuf = nil
   343  	b.entries = make(map[string][]byte)
   344  	b.size = 0
   345  }
   346  func (b *oldestEntrySortableBuffer) Prealloc(predictKeysAmount, predictDataSize int) {
   347  	b.entries = make(map[string][]byte, predictKeysAmount)
   348  	b.sortedBuf = make([]sortableBufferEntry, 0, predictKeysAmount*2)
   349  }
   350  
   351  func (b *oldestEntrySortableBuffer) Write(w io.Writer) error {
   352  	var numBuf [binary.MaxVarintLen64]byte
   353  	entries := b.sortedBuf
   354  	for _, entry := range entries {
   355  		lk := int64(len(entry.key))
   356  		if entry.key == nil {
   357  			lk = -1
   358  		}
   359  		n := binary.PutVarint(numBuf[:], lk)
   360  		if _, err := w.Write(numBuf[:n]); err != nil {
   361  			return err
   362  		}
   363  		if _, err := w.Write(entry.key); err != nil {
   364  			return err
   365  		}
   366  		lv := int64(len(entry.value))
   367  		if entry.value == nil {
   368  			lv = -1
   369  		}
   370  		n = binary.PutVarint(numBuf[:], lv)
   371  		if _, err := w.Write(numBuf[:n]); err != nil {
   372  			return err
   373  		}
   374  		if _, err := w.Write(entry.value); err != nil {
   375  			return err
   376  		}
   377  	}
   378  	return nil
   379  }
   380  func (b *oldestEntrySortableBuffer) CheckFlushSize() bool {
   381  	return b.size >= b.optimalSize
   382  }
   383  
   384  func getBufferByType(tp int, size datasize.ByteSize) Buffer {
   385  	switch tp {
   386  	case SortableSliceBuffer:
   387  		return NewSortableBuffer(size)
   388  	case SortableAppendBuffer:
   389  		return NewAppendBuffer(size)
   390  	case SortableOldestAppearedBuffer:
   391  		return NewOldestEntryBuffer(size)
   392  	default:
   393  		panic("unknown buffer type " + strconv.Itoa(tp))
   394  	}
   395  }
   396  
   397  func getTypeByBuffer(b Buffer) int {
   398  	switch b.(type) {
   399  	case *sortableBuffer:
   400  		return SortableSliceBuffer
   401  	case *appendSortableBuffer:
   402  		return SortableAppendBuffer
   403  	case *oldestEntrySortableBuffer:
   404  		return SortableOldestAppearedBuffer
   405  	default:
   406  		panic(fmt.Sprintf("unknown buffer type: %T ", b))
   407  	}
   408  }