github.com/ledgerwatch/erigon-lib@v1.0.0/etl/buffers.go

/*
   Copyright 2021 Erigon contributors

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package etl

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"sort"
	"strconv"

	"github.com/c2h5oh/datasize"
	"github.com/ledgerwatch/erigon-lib/common"
)

const (
	// SortableSliceBuffer - just a simple slice of entries
	SortableSliceBuffer = iota
	// SortableAppendBuffer - map[k] [v1 v2 v3]
	SortableAppendBuffer
	// SortableOldestAppearedBuffer - buffer that keeps only the oldest entries.
	// If v1 was added first under key K and then v2, only v1 will stay.
	SortableOldestAppearedBuffer

	// BufIOSize - 128 pages | default is 1 page | increasing over `64 * 4096` doesn't show a speedup on SSD/NVMe, but does show a speedup on cloud drives
	BufIOSize = 128 * 4096
)

var BufferOptimalSize = 256 * datasize.MB /* var because we want to sometimes change it from tests or command-line flags */

type Buffer interface {
	Put(k, v []byte)
	Get(i int, keyBuf, valBuf []byte) ([]byte, []byte)
	Len() int
	Reset()
	SizeLimit() int
	Prealloc(predictKeysAmount, predictDataAmount int)
	Write(io.Writer) error
	Sort()
	CheckFlushSize() bool
}

type sortableBufferEntry struct {
	key   []byte
	value []byte
}

var (
	_ Buffer = &sortableBuffer{}
	_ Buffer = &appendSortableBuffer{}
	_ Buffer = &oldestEntrySortableBuffer{}
)

func NewSortableBuffer(bufferOptimalSize datasize.ByteSize) *sortableBuffer {
	return &sortableBuffer{
		optimalSize: int(bufferOptimalSize.Bytes()),
	}
}

type sortableBuffer struct {
	offsets     []int
	lens        []int
	data        []byte
	optimalSize int
}

// Put adds key and value to the buffer. These slices will not be accessed later,
// so no copying is necessary.
func (b *sortableBuffer) Put(k, v []byte) {
	lk, lv := len(k), len(v)
	if k == nil {
		lk = -1
	}
	if v == nil {
		lv = -1
	}
	b.lens = append(b.lens, lk, lv)

	b.offsets = append(b.offsets, len(b.data))
	b.data = append(b.data, k...)
	b.offsets = append(b.offsets, len(b.data))
	b.data = append(b.data, v...)
}

func (b *sortableBuffer) Size() int {
	return len(b.data) + 8*len(b.offsets) + 8*len(b.lens)
}

func (b *sortableBuffer) Len() int {
	return len(b.offsets) / 2
}

func (b *sortableBuffer) Less(i, j int) bool {
	i2, j2 := i*2, j*2
	ki := b.data[b.offsets[i2] : b.offsets[i2]+b.lens[i2]]
	kj := b.data[b.offsets[j2] : b.offsets[j2]+b.lens[j2]]
	return bytes.Compare(ki, kj) < 0
}

func (b *sortableBuffer) Swap(i, j int) {
	i2, j2 := i*2, j*2
	b.offsets[i2], b.offsets[j2] = b.offsets[j2], b.offsets[i2]
	b.offsets[i2+1], b.offsets[j2+1] = b.offsets[j2+1], b.offsets[i2+1]
	b.lens[i2], b.lens[j2] = b.lens[j2], b.lens[i2]
	b.lens[i2+1], b.lens[j2+1] = b.lens[j2+1], b.lens[i2+1]
}

func (b *sortableBuffer) Get(i int, keyBuf, valBuf []byte) ([]byte, []byte) {
	i2 := i * 2
	keyOffset, valOffset := b.offsets[i2], b.offsets[i2+1]
	keyLen, valLen := b.lens[i2], b.lens[i2+1]
	if keyLen > 0 {
		keyBuf = append(keyBuf, b.data[keyOffset:keyOffset+keyLen]...)
	} else if keyLen == 0 {
		if keyBuf != nil {
			keyBuf = keyBuf[:0]
		} else {
			keyBuf = []byte{}
		}
	} else {
		keyBuf = nil
	}
	if valLen > 0 {
		valBuf = append(valBuf, b.data[valOffset:valOffset+valLen]...)
	} else if valLen == 0 {
		if valBuf != nil {
			valBuf = valBuf[:0]
		} else {
			valBuf = []byte{}
		}
	} else {
		valBuf = nil
	}
	return keyBuf, valBuf
}

func (b *sortableBuffer) Prealloc(predictKeysAmount, predictDataSize int) {
	b.lens = make([]int, 0, predictKeysAmount)
	b.offsets = make([]int, 0, predictKeysAmount)
	b.data = make([]byte, 0, predictDataSize)
}

func (b *sortableBuffer) Reset() {
	b.offsets = b.offsets[:0]
	b.lens = b.lens[:0]
	b.data = b.data[:0]
}
func (b *sortableBuffer) SizeLimit() int { return b.optimalSize }
func (b *sortableBuffer) Sort() {
	if sort.IsSorted(b) {
		return
	}
	sort.Stable(b)
}

func (b *sortableBuffer) CheckFlushSize() bool {
	return b.Size() >= b.optimalSize
}

func (b *sortableBuffer) Write(w io.Writer) error {
	var numBuf [binary.MaxVarintLen64]byte
	for i, offset := range b.offsets {
		l := b.lens[i]
		n := binary.PutVarint(numBuf[:], int64(l))
		if _, err := w.Write(numBuf[:n]); err != nil {
			return err
		}
		if l <= 0 {
			continue
		}
		if _, err := w.Write(b.data[offset : offset+l]); err != nil {
			return err
		}
	}
	return nil
}

func NewAppendBuffer(bufferOptimalSize datasize.ByteSize) *appendSortableBuffer {
	return &appendSortableBuffer{
		entries:     make(map[string][]byte),
		size:        0,
		optimalSize: int(bufferOptimalSize.Bytes()),
	}
}

type appendSortableBuffer struct {
	entries     map[string][]byte
	sortedBuf   []sortableBufferEntry
	size        int
	optimalSize int
}

func (b *appendSortableBuffer) Put(k, v []byte) {
	stored, ok := b.entries[string(k)]
	if !ok {
		b.size += len(k)
	}
	b.size += len(v)
	stored = append(stored, v...)
	b.entries[string(k)] = stored
}

func (b *appendSortableBuffer) Size() int      { return b.size }
func (b *appendSortableBuffer) SizeLimit() int { return b.optimalSize }

func (b *appendSortableBuffer) Len() int {
	return len(b.entries)
}
func (b *appendSortableBuffer) Sort() {
	for i := range b.entries {
		b.sortedBuf = append(b.sortedBuf, sortableBufferEntry{key: []byte(i), value: b.entries[i]})
	}
	sort.Stable(b)
}

func (b *appendSortableBuffer) Less(i, j int) bool {
	return bytes.Compare(b.sortedBuf[i].key, b.sortedBuf[j].key) < 0
}

func (b *appendSortableBuffer) Swap(i, j int) {
	b.sortedBuf[i], b.sortedBuf[j] = b.sortedBuf[j], b.sortedBuf[i]
}

func (b *appendSortableBuffer) Get(i int, keyBuf, valBuf []byte) ([]byte, []byte) {
	keyBuf = append(keyBuf, b.sortedBuf[i].key...)
	valBuf = append(valBuf, b.sortedBuf[i].value...)
	return keyBuf, valBuf
}
func (b *appendSortableBuffer) Reset() {
	b.sortedBuf = nil
	b.entries = make(map[string][]byte)
	b.size = 0
}
func (b *appendSortableBuffer) Prealloc(predictKeysAmount, predictDataSize int) {
	b.entries = make(map[string][]byte, predictKeysAmount)
	b.sortedBuf = make([]sortableBufferEntry, 0, predictKeysAmount*2)
}

func (b *appendSortableBuffer) Write(w io.Writer) error {
	var numBuf [binary.MaxVarintLen64]byte
	entries := b.sortedBuf
	for _, entry := range entries {
		lk := int64(len(entry.key))
		if entry.key == nil {
			lk = -1
		}
		n := binary.PutVarint(numBuf[:], lk)
		if _, err := w.Write(numBuf[:n]); err != nil {
			return err
		}
		if _, err := w.Write(entry.key); err != nil {
			return err
		}
		lv := int64(len(entry.value))
		if entry.value == nil {
			lv = -1
		}
		n = binary.PutVarint(numBuf[:], lv)
		if _, err := w.Write(numBuf[:n]); err != nil {
			return err
		}
		if _, err := w.Write(entry.value); err != nil {
			return err
		}
	}
	return nil
}

func (b *appendSortableBuffer) CheckFlushSize() bool {
	return b.size >= b.optimalSize
}

func NewOldestEntryBuffer(bufferOptimalSize datasize.ByteSize) *oldestEntrySortableBuffer {
	return &oldestEntrySortableBuffer{
		entries:     make(map[string][]byte),
		size:        0,
		optimalSize: int(bufferOptimalSize.Bytes()),
	}
}

type oldestEntrySortableBuffer struct {
	entries     map[string][]byte
	sortedBuf   []sortableBufferEntry
	size        int
	optimalSize int
}

func (b *oldestEntrySortableBuffer) Put(k, v []byte) {
	_, ok := b.entries[string(k)]
	if ok {
		// if we already have this entry, we keep it and ignore the new value
		return
	}

	b.size += len(k)*2 + len(v)
	b.entries[string(k)] = common.Copy(v)
}

func (b *oldestEntrySortableBuffer) Size() int      { return b.size }
func (b *oldestEntrySortableBuffer) SizeLimit() int { return b.optimalSize }

func (b *oldestEntrySortableBuffer) Len() int {
	return len(b.entries)
}

func (b *oldestEntrySortableBuffer) Sort() {
	for k, v := range b.entries {
		b.sortedBuf = append(b.sortedBuf, sortableBufferEntry{key: []byte(k), value: v})
	}
	sort.Stable(b)
}

func (b *oldestEntrySortableBuffer) Less(i, j int) bool {
	return bytes.Compare(b.sortedBuf[i].key, b.sortedBuf[j].key) < 0
}

func (b *oldestEntrySortableBuffer) Swap(i, j int) {
	b.sortedBuf[i], b.sortedBuf[j] = b.sortedBuf[j], b.sortedBuf[i]
}
func (b *oldestEntrySortableBuffer) Get(i int, keyBuf, valBuf []byte) ([]byte, []byte) {
	keyBuf = append(keyBuf, b.sortedBuf[i].key...)
	valBuf = append(valBuf, b.sortedBuf[i].value...)
	return keyBuf, valBuf
}
func (b *oldestEntrySortableBuffer) Reset() {
	b.sortedBuf = nil
	b.entries = make(map[string][]byte)
	b.size = 0
}
func (b *oldestEntrySortableBuffer) Prealloc(predictKeysAmount, predictDataSize int) {
	b.entries = make(map[string][]byte, predictKeysAmount)
	b.sortedBuf = make([]sortableBufferEntry, 0, predictKeysAmount*2)
}

func (b *oldestEntrySortableBuffer) Write(w io.Writer) error {
	var numBuf [binary.MaxVarintLen64]byte
	entries := b.sortedBuf
	for _, entry := range entries {
		lk := int64(len(entry.key))
		if entry.key == nil {
			lk = -1
		}
		n := binary.PutVarint(numBuf[:], lk)
		if _, err := w.Write(numBuf[:n]); err != nil {
			return err
		}
		if _, err := w.Write(entry.key); err != nil {
			return err
		}
		lv := int64(len(entry.value))
		if entry.value == nil {
			lv = -1
		}
		n = binary.PutVarint(numBuf[:], lv)
		if _, err := w.Write(numBuf[:n]); err != nil {
			return err
		}
		if _, err := w.Write(entry.value); err != nil {
			return err
		}
	}
	return nil
}
func (b *oldestEntrySortableBuffer) CheckFlushSize() bool {
	return b.size >= b.optimalSize
}

func getBufferByType(tp int, size datasize.ByteSize) Buffer {
	switch tp {
	case SortableSliceBuffer:
		return NewSortableBuffer(size)
	case SortableAppendBuffer:
		return NewAppendBuffer(size)
	case SortableOldestAppearedBuffer:
		return NewOldestEntryBuffer(size)
	default:
		panic("unknown buffer type " + strconv.Itoa(tp))
	}
}

func getTypeByBuffer(b Buffer) int {
	switch b.(type) {
	case *sortableBuffer:
		return SortableSliceBuffer
	case *appendSortableBuffer:
		return SortableAppendBuffer
	case *oldestEntrySortableBuffer:
		return SortableOldestAppearedBuffer
	default:
		panic(fmt.Sprintf("unknown buffer type: %T", b))
	}
}
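For reference, a minimal sketch of how sortableBuffer might be exercised from a test inside package etl (not part of buffers.go); the test name and the key/value literals are illustrative assumptions, while all calls (NewSortableBuffer, Put, Sort, Get, Write, CheckFlushSize) come from the file above.

package etl

import (
	"bytes"
	"testing"
)

// TestSortableBufferSketch puts two out-of-order entries, sorts them by key,
// and serializes the buffer with the varint-prefixed Write format.
func TestSortableBufferSketch(t *testing.T) {
	buf := NewSortableBuffer(BufferOptimalSize)
	buf.Put([]byte("b"), []byte("2"))
	buf.Put([]byte("a"), []byte("1"))
	buf.Sort() // orders entries by key: "a" first, then "b"

	if k, v := buf.Get(0, nil, nil); string(k) != "a" || string(v) != "1" {
		t.Fatalf("unexpected first entry: %q=%q", k, v)
	}

	var out bytes.Buffer
	if err := buf.Write(&out); err != nil { // varint(len) + bytes for each key and value
		t.Fatal(err)
	}
	if buf.CheckFlushSize() {
		t.Fatal("a tiny buffer should not have reached its flush size")
	}
}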