github.com/ledgerwatch/erigon-lib@v1.0.0/etl/etl_test.go (about)

     1  /*
     2  Copyright 2021 Erigon contributors
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8  	http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  package etl
    17  
    18  import (
    19  	"bytes"
    20  	"encoding/hex"
    21  	"encoding/json"
    22  	"fmt"
    23  	"io"
    24  	"os"
    25  	"strings"
    26  	"testing"
    27  
    28  	"github.com/ledgerwatch/erigon-lib/kv"
    29  	"github.com/ledgerwatch/erigon-lib/kv/memdb"
    30  	"github.com/ledgerwatch/log/v3"
    31  	"github.com/stretchr/testify/assert"
    32  	"github.com/stretchr/testify/require"
    33  )
    34  
    35  func decodeHex(in string) []byte {
    36  	payload, err := hex.DecodeString(in)
    37  	if err != nil {
    38  		panic(err)
    39  	}
    40  	return payload
    41  }
    42  
    43  func TestEmptyValueIsNotANil(t *testing.T) {
    44  	logger := log.New()
    45  	t.Run("sortable", func(t *testing.T) {
    46  		collector := NewCollector(t.Name(), "", NewSortableBuffer(1), logger)
    47  		defer collector.Close()
    48  		require := require.New(t)
    49  		require.NoError(collector.Collect([]byte{1}, []byte{}))
    50  		require.NoError(collector.Collect([]byte{2}, nil))
    51  		require.NoError(collector.Load(nil, "", func(k, v []byte, table CurrentTableReader, next LoadNextFunc) error {
    52  			if k[0] == 1 {
    53  				require.Equal([]byte{}, v)
    54  			} else {
    55  				require.Nil(v)
    56  			}
    57  			return nil
    58  		}, TransformArgs{}))
    59  	})
    60  	t.Run("append", func(t *testing.T) {
    61  		// append buffer doesn't support nil values
    62  		collector := NewCollector(t.Name(), "", NewAppendBuffer(1), logger)
    63  		defer collector.Close()
    64  		require := require.New(t)
    65  		require.NoError(collector.Collect([]byte{1}, []byte{}))
    66  		require.NoError(collector.Collect([]byte{2}, nil))
    67  		require.NoError(collector.Load(nil, "", func(k, v []byte, table CurrentTableReader, next LoadNextFunc) error {
    68  			require.Nil(v)
    69  			return nil
    70  		}, TransformArgs{}))
    71  	})
    72  	t.Run("oldest", func(t *testing.T) {
    73  		collector := NewCollector(t.Name(), "", NewOldestEntryBuffer(1), logger)
    74  		defer collector.Close()
    75  		require := require.New(t)
    76  		require.NoError(collector.Collect([]byte{1}, []byte{}))
    77  		require.NoError(collector.Collect([]byte{2}, nil))
    78  		require.NoError(collector.Load(nil, "", func(k, v []byte, table CurrentTableReader, next LoadNextFunc) error {
    79  			if k[0] == 1 {
    80  				require.Equal([]byte{}, v)
    81  			} else {
    82  				require.Nil(v)
    83  			}
    84  			return nil
    85  		}, TransformArgs{}))
    86  	})
    87  }
    88  
    89  func TestEmptyKeyValue(t *testing.T) {
    90  	logger := log.New()
    91  	_, tx := memdb.NewTestTx(t)
    92  	require := require.New(t)
    93  	table := kv.ChaindataTables[0]
    94  	collector := NewCollector(t.Name(), "", NewSortableBuffer(1), logger)
    95  	defer collector.Close()
    96  	require.NoError(collector.Collect([]byte{2}, []byte{}))
    97  	require.NoError(collector.Collect([]byte{1}, []byte{1}))
    98  	require.NoError(collector.Load(tx, table, IdentityLoadFunc, TransformArgs{}))
    99  	v, err := tx.GetOne(table, []byte{2})
   100  	require.NoError(err)
   101  	require.Equal([]byte{}, v)
   102  	v, err = tx.GetOne(table, []byte{1})
   103  	require.NoError(err)
   104  	require.Equal([]byte{1}, v)
   105  
   106  	collector = NewCollector(t.Name(), "", NewSortableBuffer(1), logger)
   107  	defer collector.Close()
   108  	require.NoError(collector.Collect([]byte{}, nil))
   109  	require.NoError(collector.Load(tx, table, IdentityLoadFunc, TransformArgs{}))
   110  	v, err = tx.GetOne(table, []byte{})
   111  	require.NoError(err)
   112  	require.Nil(v)
   113  }
   114  
   115  func TestWriteAndReadBufferEntry(t *testing.T) {
   116  	b := NewSortableBuffer(128)
   117  	buffer := bytes.NewBuffer(make([]byte, 0))
   118  
   119  	entries := make([]sortableBufferEntry, 100)
   120  	for i := range entries {
   121  		entries[i].key = []byte(fmt.Sprintf("key-%d", i))
   122  		entries[i].value = []byte(fmt.Sprintf("value-%d", i))
   123  		b.Put(entries[i].key, entries[i].value)
   124  	}
   125  
   126  	if err := b.Write(buffer); err != nil {
   127  		t.Error(err)
   128  	}
   129  
   130  	bb := buffer.Bytes()
   131  
   132  	readBuffer := bytes.NewReader(bb)
   133  
   134  	for i := range entries {
   135  		k, v, err := readElementFromDisk(readBuffer, readBuffer, nil, nil)
   136  		if err != nil {
   137  			t.Error(err)
   138  		}
   139  		assert.Equal(t, string(entries[i].key), string(k))
   140  		assert.Equal(t, string(entries[i].value), string(v))
   141  	}
   142  
   143  	_, _, err := readElementFromDisk(readBuffer, readBuffer, nil, nil)
   144  	assert.Equal(t, io.EOF, err)
   145  }
   146  
   147  func TestNextKey(t *testing.T) {
   148  	for _, tc := range []string{
   149  		"00000001->00000002",
   150  		"000000FF->00000100",
   151  		"FEFFFFFF->FF000000",
   152  	} {
   153  		parts := strings.Split(tc, "->")
   154  		input := decodeHex(parts[0])
   155  		expectedOutput := decodeHex(parts[1])
   156  		actualOutput, err := NextKey(input)
   157  		assert.NoError(t, err)
   158  		assert.Equal(t, expectedOutput, actualOutput)
   159  	}
   160  }
   161  
   162  func TestNextKeyErr(t *testing.T) {
   163  	for _, tc := range []string{
   164  		"",
   165  		"FFFFFF",
   166  	} {
   167  		input := decodeHex(tc)
   168  		_, err := NextKey(input)
   169  		assert.Error(t, err)
   170  	}
   171  }
   172  
   173  func TestFileDataProviders(t *testing.T) {
   174  	logger := log.New()
   175  	// test invariant when we go through files (> 1 buffer)
   176  	_, tx := memdb.NewTestTx(t)
   177  	sourceBucket := kv.ChaindataTables[0]
   178  
   179  	generateTestData(t, tx, sourceBucket, 10)
   180  
   181  	collector := NewCollector(t.Name(), "", NewSortableBuffer(1), logger)
   182  
   183  	err := extractBucketIntoFiles("logPrefix", tx, sourceBucket, nil, nil, collector, testExtractToMapFunc, nil, nil, logger)
   184  	assert.NoError(t, err)
   185  
   186  	assert.Equal(t, 10, len(collector.dataProviders))
   187  
   188  	for _, p := range collector.dataProviders {
   189  		fp, ok := p.(*fileDataProvider)
   190  		assert.True(t, ok)
   191  		err := fp.Wait()
   192  		require.NoError(t, err)
   193  		_, err = os.Stat(fp.file.Name())
   194  		assert.NoError(t, err)
   195  	}
   196  
   197  	collector.Close()
   198  
   199  	for _, p := range collector.dataProviders {
   200  		fp, ok := p.(*fileDataProvider)
   201  		assert.True(t, ok)
   202  		_, err = os.Stat(fp.file.Name())
   203  		assert.True(t, os.IsNotExist(err))
   204  	}
   205  }
   206  
   207  func TestRAMDataProviders(t *testing.T) {
   208  	logger := log.New()
   209  	// test invariant when we go through memory (1 buffer)
   210  	_, tx := memdb.NewTestTx(t)
   211  	sourceBucket := kv.ChaindataTables[0]
   212  	generateTestData(t, tx, sourceBucket, 10)
   213  
   214  	collector := NewCollector(t.Name(), "", NewSortableBuffer(BufferOptimalSize), logger)
   215  	err := extractBucketIntoFiles("logPrefix", tx, sourceBucket, nil, nil, collector, testExtractToMapFunc, nil, nil, logger)
   216  	assert.NoError(t, err)
   217  
   218  	assert.Equal(t, 1, len(collector.dataProviders))
   219  
   220  	for _, p := range collector.dataProviders {
   221  		mp, ok := p.(*memoryDataProvider)
   222  		assert.True(t, ok)
   223  		assert.Equal(t, 10, mp.buffer.Len())
   224  	}
   225  }
   226  
   227  func TestTransformRAMOnly(t *testing.T) {
   228  	logger := log.New()
   229  	// test invariant when we only have one buffer and it fits into RAM (exactly 1 buffer)
   230  	_, tx := memdb.NewTestTx(t)
   231  
   232  	sourceBucket := kv.ChaindataTables[0]
   233  	destBucket := kv.ChaindataTables[1]
   234  	generateTestData(t, tx, sourceBucket, 20)
   235  	err := Transform(
   236  		"logPrefix",
   237  		tx,
   238  		sourceBucket,
   239  		destBucket,
   240  		"", // temp dir
   241  		testExtractToMapFunc,
   242  		testLoadFromMapFunc,
   243  		TransformArgs{},
   244  		logger,
   245  	)
   246  	assert.Nil(t, err)
   247  	compareBuckets(t, tx, sourceBucket, destBucket, nil)
   248  }
   249  
   250  func TestEmptySourceBucket(t *testing.T) {
   251  	logger := log.New()
   252  	_, tx := memdb.NewTestTx(t)
   253  	sourceBucket := kv.ChaindataTables[0]
   254  	destBucket := kv.ChaindataTables[1]
   255  	err := Transform(
   256  		"logPrefix",
   257  		tx,
   258  		sourceBucket,
   259  		destBucket,
   260  		"", // temp dir
   261  		testExtractToMapFunc,
   262  		testLoadFromMapFunc,
   263  		TransformArgs{},
   264  		logger,
   265  	)
   266  	assert.Nil(t, err)
   267  	compareBuckets(t, tx, sourceBucket, destBucket, nil)
   268  }
   269  
   270  func TestTransformExtractStartKey(t *testing.T) {
   271  	logger := log.New()
   272  	// test invariant when we only have one buffer and it fits into RAM (exactly 1 buffer)
   273  	_, tx := memdb.NewTestTx(t)
   274  	sourceBucket := kv.ChaindataTables[0]
   275  	destBucket := kv.ChaindataTables[1]
   276  	generateTestData(t, tx, sourceBucket, 10)
   277  	err := Transform(
   278  		"logPrefix",
   279  		tx,
   280  		sourceBucket,
   281  		destBucket,
   282  		"", // temp dir
   283  		testExtractToMapFunc,
   284  		testLoadFromMapFunc,
   285  		TransformArgs{ExtractStartKey: []byte(fmt.Sprintf("%10d-key-%010d", 5, 5))},
   286  		logger,
   287  	)
   288  	assert.Nil(t, err)
   289  	compareBuckets(t, tx, sourceBucket, destBucket, []byte(fmt.Sprintf("%10d-key-%010d", 5, 5)))
   290  }
   291  
   292  func TestTransformThroughFiles(t *testing.T) {
   293  	logger := log.New()
   294  	// test invariant when we go through files (> 1 buffer)
   295  	_, tx := memdb.NewTestTx(t)
   296  	sourceBucket := kv.ChaindataTables[0]
   297  	destBucket := kv.ChaindataTables[1]
   298  	generateTestData(t, tx, sourceBucket, 10)
   299  	err := Transform(
   300  		"logPrefix",
   301  		tx,
   302  		sourceBucket,
   303  		destBucket,
   304  		"", // temp dir
   305  		testExtractToMapFunc,
   306  		testLoadFromMapFunc,
   307  		TransformArgs{
   308  			BufferSize: 1,
   309  		},
   310  		logger,
   311  	)
   312  	assert.Nil(t, err)
   313  	compareBuckets(t, tx, sourceBucket, destBucket, nil)
   314  }
   315  
   316  func TestTransformDoubleOnExtract(t *testing.T) {
   317  	logger := log.New()
   318  	// test invariant when extractFunc multiplies the data 2x
   319  	_, tx := memdb.NewTestTx(t)
   320  	sourceBucket := kv.ChaindataTables[0]
   321  	destBucket := kv.ChaindataTables[1]
   322  	generateTestData(t, tx, sourceBucket, 10)
   323  	err := Transform(
   324  		"logPrefix",
   325  		tx,
   326  		sourceBucket,
   327  		destBucket,
   328  		"", // temp dir
   329  		testExtractDoubleToMapFunc,
   330  		testLoadFromMapFunc,
   331  		TransformArgs{},
   332  		logger,
   333  	)
   334  	assert.Nil(t, err)
   335  	compareBucketsDouble(t, tx, sourceBucket, destBucket)
   336  }
   337  
   338  func TestTransformDoubleOnLoad(t *testing.T) {
   339  	logger := log.New()
   340  	// test invariant when loadFunc multiplies the data 2x
   341  	_, tx := memdb.NewTestTx(t)
   342  	sourceBucket := kv.ChaindataTables[0]
   343  	destBucket := kv.ChaindataTables[1]
   344  	generateTestData(t, tx, sourceBucket, 10)
   345  	err := Transform(
   346  		"logPrefix",
   347  		tx,
   348  		sourceBucket,
   349  		destBucket,
   350  		"", // temp dir
   351  		testExtractToMapFunc,
   352  		testLoadFromMapDoubleFunc,
   353  		TransformArgs{},
   354  		logger,
   355  	)
   356  	assert.Nil(t, err)
   357  	compareBucketsDouble(t, tx, sourceBucket, destBucket)
   358  }
   359  
   360  func generateTestData(t *testing.T, db kv.Putter, bucket string, count int) {
   361  	t.Helper()
   362  	for i := 0; i < count; i++ {
   363  		k := []byte(fmt.Sprintf("%10d-key-%010d", i, i))
   364  		v := []byte(fmt.Sprintf("val-%099d", i))
   365  		err := db.Put(bucket, k, v)
   366  		assert.NoError(t, err)
   367  	}
   368  }
   369  
   370  func testExtractToMapFunc(k, v []byte, next ExtractNextFunc) error {
   371  	valueMap := make(map[string][]byte)
   372  	valueMap["value"] = v
   373  	out, err := json.Marshal(valueMap)
   374  	if err != nil {
   375  		return err
   376  	}
   377  	return next(k, k, out)
   378  }
   379  
   380  func testExtractDoubleToMapFunc(k, v []byte, next ExtractNextFunc) error {
   381  	var err error
   382  	valueMap := make(map[string][]byte)
   383  	valueMap["value"] = append(v, 0xAA)
   384  	k1 := append(k, 0xAA)
   385  	out, err := json.Marshal(valueMap)
   386  	if err != nil {
   387  		panic(err)
   388  	}
   389  
   390  	err = next(k, k1, out)
   391  	if err != nil {
   392  		return err
   393  	}
   394  
   395  	valueMap = make(map[string][]byte)
   396  	valueMap["value"] = append(v, 0xBB)
   397  	k2 := append(k, 0xBB)
   398  	out, err = json.Marshal(valueMap)
   399  	if err != nil {
   400  		panic(err)
   401  	}
   402  	return next(k, k2, out)
   403  }
   404  
   405  func testLoadFromMapFunc(k []byte, v []byte, _ CurrentTableReader, next LoadNextFunc) error {
   406  	valueMap := make(map[string][]byte)
   407  	err := json.Unmarshal(v, &valueMap)
   408  	if err != nil {
   409  		return err
   410  	}
   411  	realValue := valueMap["value"]
   412  	return next(k, k, realValue)
   413  }
   414  
   415  func testLoadFromMapDoubleFunc(k []byte, v []byte, _ CurrentTableReader, next LoadNextFunc) error {
   416  	valueMap := make(map[string][]byte)
   417  	err := json.Unmarshal(v, &valueMap)
   418  	if err != nil {
   419  		return err
   420  	}
   421  	realValue := valueMap["value"]
   422  
   423  	err = next(k, append(k, 0xAA), append(realValue, 0xAA))
   424  	if err != nil {
   425  		return err
   426  	}
   427  	return next(k, append(k, 0xBB), append(realValue, 0xBB))
   428  }
   429  
   430  func compareBuckets(t *testing.T, db kv.Tx, b1, b2 string, startKey []byte) {
   431  	t.Helper()
   432  	b1Map := make(map[string]string)
   433  	err := db.ForEach(b1, startKey, func(k, v []byte) error {
   434  		b1Map[fmt.Sprintf("%x", k)] = fmt.Sprintf("%x", v)
   435  		return nil
   436  	})
   437  	assert.NoError(t, err)
   438  	b2Map := make(map[string]string)
   439  	err = db.ForEach(b2, nil, func(k, v []byte) error {
   440  		b2Map[fmt.Sprintf("%x", k)] = fmt.Sprintf("%x", v)
   441  		return nil
   442  	})
   443  	assert.NoError(t, err)
   444  	assert.Equal(t, b1Map, b2Map)
   445  }
   446  
   447  func compareBucketsDouble(t *testing.T, db kv.Tx, b1, b2 string) {
   448  	t.Helper()
   449  	b1Map := make(map[string]string)
   450  	err := db.ForEach(b1, nil, func(k, v []byte) error {
   451  		b1Map[fmt.Sprintf("%x", append(k, 0xAA))] = fmt.Sprintf("%x", append(v, 0xAA))
   452  		b1Map[fmt.Sprintf("%x", append(k, 0xBB))] = fmt.Sprintf("%x", append(v, 0xBB))
   453  		return nil
   454  	})
   455  	assert.NoError(t, err)
   456  	b2Map := make(map[string]string)
   457  	err = db.ForEach(b2, nil, func(k, v []byte) error {
   458  		b2Map[fmt.Sprintf("%x", k)] = fmt.Sprintf("%x", v)
   459  		return nil
   460  	})
   461  	assert.NoError(t, err)
   462  	assert.Equal(t, b1Map, b2Map)
   463  }
   464  
   465  func TestReuseCollectorAfterLoad(t *testing.T) {
   466  	logger := log.New()
   467  	buf := NewSortableBuffer(128)
   468  	c := NewCollector("", t.TempDir(), buf, logger)
   469  
   470  	err := c.Collect([]byte{1}, []byte{2})
   471  	require.NoError(t, err)
   472  	see := 0
   473  	err = c.Load(nil, "", func(k, v []byte, table CurrentTableReader, next LoadNextFunc) error {
   474  		see++
   475  		return nil
   476  	}, TransformArgs{})
   477  	require.NoError(t, err)
   478  	require.Equal(t, 1, see)
   479  
   480  	// buffers are not lost
   481  	require.Zero(t, len(buf.data))
   482  	require.Zero(t, len(buf.lens))
   483  	require.Zero(t, len(buf.offsets))
   484  	require.NotZero(t, cap(buf.data))
   485  	require.NotZero(t, cap(buf.lens))
   486  	require.NotZero(t, cap(buf.offsets))
   487  
   488  	// teset that no data visible
   489  	see = 0
   490  	err = c.Load(nil, "", func(k, v []byte, table CurrentTableReader, next LoadNextFunc) error {
   491  		see++
   492  		return nil
   493  	}, TransformArgs{})
   494  	require.NoError(t, err)
   495  	require.Equal(t, 0, see)
   496  
   497  	// reuse
   498  	see = 0
   499  	err = c.Load(nil, "", func(k, v []byte, table CurrentTableReader, next LoadNextFunc) error {
   500  		see++
   501  		return nil
   502  	}, TransformArgs{})
   503  	require.NoError(t, err)
   504  	require.Equal(t, 0, see)
   505  
   506  	err = c.Collect([]byte{3}, []byte{4})
   507  	require.NoError(t, err)
   508  	see = 0
   509  	err = c.Load(nil, "", func(k, v []byte, table CurrentTableReader, next LoadNextFunc) error {
   510  		see++
   511  		return nil
   512  	}, TransformArgs{})
   513  	require.NoError(t, err)
   514  	require.Equal(t, 1, see)
   515  }