github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/col/colserde/arrowbatchconverter_test.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/col/colserde/arrowbatchconverter_test.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package colserde_test
    12  
    13  import (
    14  	"bytes"
    15  	"fmt"
    16  	"testing"
    17  
    18  	"github.com/apache/arrow/go/arrow/array"
    19  	"github.com/cockroachdb/cockroach/pkg/col/coldata"
    20  	"github.com/cockroachdb/cockroach/pkg/col/coldatatestutils"
    21  	"github.com/cockroachdb/cockroach/pkg/col/colserde"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    25  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    26  	"github.com/cockroachdb/cockroach/pkg/util/randutil"
    27  	"github.com/stretchr/testify/require"
    28  )
    29  
    30  func randomBatch(allocator *colmem.Allocator) ([]*types.T, coldata.Batch) {
    31  	const maxTyps = 16
    32  	rng, _ := randutil.NewPseudoRand()
    33  
    34  	typs := make([]*types.T, rng.Intn(maxTyps)+1)
    35  	for i := range typs {
    36  		typs[i] = sqlbase.RandType(rng)
    37  	}
    38  
    39  	capacity := rng.Intn(coldata.BatchSize()) + 1
    40  	length := rng.Intn(capacity)
    41  	b := coldatatestutils.RandomBatch(allocator, rng, typs, capacity, length, rng.Float64())
    42  	return typs, b
    43  }
    44  
    45  func TestArrowBatchConverterRandom(t *testing.T) {
    46  	defer leaktest.AfterTest(t)()
    47  
    48  	typs, b := randomBatch(testAllocator)
    49  	c, err := colserde.NewArrowBatchConverter(typs)
    50  	require.NoError(t, err)
    51  
    52  	// Make a copy of the original batch because the converter modifies and casts
    53  	// data without copying for performance reasons.
    54  	expected := coldatatestutils.CopyBatch(b, typs, testColumnFactory)
    55  
    56  	arrowData, err := c.BatchToArrow(b)
    57  	require.NoError(t, err)
    58  	actual := testAllocator.NewMemBatchWithSize(typs, b.Length())
    59  	require.NoError(t, c.ArrowToBatch(arrowData, actual))
    60  
    61  	coldata.AssertEquivalentBatches(t, expected, actual)
    62  }
    63  
    64  // roundTripBatch is a helper function that round trips a batch through the
    65  // ArrowBatchConverter and RecordBatchSerializer and asserts that the output
    66  // batch is equal to the input batch. Make sure to copy the input batch before
    67  // passing it to this function to assert equality.
    68  func roundTripBatch(
    69  	b coldata.Batch,
    70  	c *colserde.ArrowBatchConverter,
    71  	r *colserde.RecordBatchSerializer,
    72  	typs []*types.T,
    73  ) (coldata.Batch, error) {
    74  	var buf bytes.Buffer
    75  	arrowDataIn, err := c.BatchToArrow(b)
    76  	if err != nil {
    77  		return nil, err
    78  	}
    79  	_, _, err = r.Serialize(&buf, arrowDataIn)
    80  	if err != nil {
    81  		return nil, err
    82  	}
    83  
    84  	var arrowDataOut []*array.Data
    85  	if err := r.Deserialize(&arrowDataOut, buf.Bytes()); err != nil {
    86  		return nil, err
    87  	}
    88  	actual := testAllocator.NewMemBatchWithSize(typs, b.Length())
    89  	if err := c.ArrowToBatch(arrowDataOut, actual); err != nil {
    90  		return nil, err
    91  	}
    92  	return actual, nil
    93  }
    94  
    95  func TestRecordBatchRoundtripThroughBytes(t *testing.T) {
    96  	defer leaktest.AfterTest(t)()
    97  
    98  	for run := 0; run < 10; run++ {
    99  		typs, b := randomBatch(testAllocator)
   100  		c, err := colserde.NewArrowBatchConverter(typs)
   101  		require.NoError(t, err)
   102  		r, err := colserde.NewRecordBatchSerializer(typs)
   103  		require.NoError(t, err)
   104  
   105  		// Make a copy of the original batch because the converter modifies and
   106  		// casts data without copying for performance reasons.
   107  		expected := coldatatestutils.CopyBatch(b, typs, testColumnFactory)
   108  		actual, err := roundTripBatch(b, c, r, typs)
   109  		require.NoError(t, err)
   110  
   111  		coldata.AssertEquivalentBatches(t, expected, actual)
   112  	}
   113  }
   114  
   115  func BenchmarkArrowBatchConverter(b *testing.B) {
   116  	// fixedLen specifies how many bytes we should fit variable length data types
   117  	// to in order to reduce benchmark noise.
   118  	const fixedLen = 64
   119  
   120  	rng, _ := randutil.NewPseudoRand()
   121  
   122  	typs := []*types.T{
   123  		types.Bool,
   124  		types.Bytes,
   125  		types.Decimal,
   126  		types.Int,
   127  		types.Timestamp,
   128  	}
   129  	// numBytes corresponds 1:1 to typs and specifies how many bytes we are
   130  	// converting on one iteration of the benchmark for the corresponding type in
   131  	// typs.
   132  	numBytes := []int64{
   133  		int64(coldata.BatchSize()),
   134  		fixedLen * int64(coldata.BatchSize()),
   135  		0, // The number of bytes for decimals will be set below.
   136  		8 * int64(coldata.BatchSize()),
   137  		3 * 8 * int64(coldata.BatchSize()),
   138  	}
   139  	// Run a benchmark on every type we care about.
   140  	for typIdx, typ := range typs {
   141  		batch := coldatatestutils.RandomBatch(testAllocator, rng, []*types.T{typ}, coldata.BatchSize(), 0 /* length */, 0 /* nullProbability */)
   142  		if batch.Width() != 1 {
   143  			b.Fatalf("unexpected batch width: %d", batch.Width())
   144  		}
   145  		if typ.Identical(types.Bytes) {
   146  			// This type has variable length elements, fit all of them to be fixedLen
   147  			// bytes long so that we can compare results of one benchmark with
   148  			// another. Since we can't overwrite elements in a Bytes, create a new
   149  			// one.
   150  			// TODO(asubiotto): We should probably create some random spec struct that
   151  			//  we pass in to RandomBatch.
   152  			bytes := batch.ColVec(0).Bytes()
   153  			newBytes := coldata.NewBytes(bytes.Len())
   154  			for i := 0; i < bytes.Len(); i++ {
   155  				diff := len(bytes.Get(i)) - fixedLen
   156  				if diff < 0 {
   157  					newBytes.Set(i, append(bytes.Get(i), make([]byte, -diff)...))
   158  				} else if diff >= 0 {
   159  					newBytes.Set(i, bytes.Get(i)[:fixedLen])
   160  				}
   161  			}
   162  			batch.ColVec(0).SetCol(newBytes)
   163  		} else if typ.Identical(types.Decimal) {
   164  			// Decimal is variable length type, so we want to calculate precisely the
   165  			// total size of all decimals in the vector.
   166  			decimals := batch.ColVec(0).Decimal()
   167  			for _, d := range decimals {
   168  				marshaled, err := d.MarshalText()
   169  				require.NoError(b, err)
   170  				numBytes[typIdx] += int64(len(marshaled))
   171  			}
   172  		}
   173  		c, err := colserde.NewArrowBatchConverter([]*types.T{typ})
   174  		require.NoError(b, err)
   175  		nullFractions := []float64{0, 0.25, 0.5}
   176  		setNullFraction := func(batch coldata.Batch, nullFraction float64) {
   177  			vec := batch.ColVec(0)
   178  			vec.Nulls().UnsetNulls()
   179  			numNulls := int(nullFraction * float64(batch.Length()))
   180  			// Set the first numNulls elements to null.
   181  			for i := 0; i < batch.Length() && i < numNulls; i++ {
   182  				vec.Nulls().SetNull(i)
   183  			}
   184  		}
   185  		for _, nullFraction := range nullFractions {
   186  			setNullFraction(batch, nullFraction)
   187  			testPrefix := fmt.Sprintf("%s/nullFraction=%0.2f", typ.String(), nullFraction)
   188  			var data []*array.Data
   189  			b.Run(testPrefix+"/BatchToArrow", func(b *testing.B) {
   190  				b.SetBytes(numBytes[typIdx])
   191  				for i := 0; i < b.N; i++ {
   192  					data, _ = c.BatchToArrow(batch)
   193  					if len(data) != 1 {
   194  						b.Fatal("expected arrow batch of length 1")
   195  					}
   196  					if data[0].Len() != coldata.BatchSize() {
   197  						b.Fatal("unexpected number of elements")
   198  					}
   199  				}
   200  			})
   201  		}
   202  		for _, nullFraction := range nullFractions {
   203  			setNullFraction(batch, nullFraction)
   204  			data, err := c.BatchToArrow(batch)
   205  			require.NoError(b, err)
   206  			testPrefix := fmt.Sprintf("%s/nullFraction=%0.2f", typ.String(), nullFraction)
   207  			result := testAllocator.NewMemBatch([]*types.T{typ})
   208  			b.Run(testPrefix+"/ArrowToBatch", func(b *testing.B) {
   209  				b.SetBytes(numBytes[typIdx])
   210  				for i := 0; i < b.N; i++ {
   211  					// Using require.NoError here causes large enough allocations to
   212  					// affect the result.
   213  					if err := c.ArrowToBatch(data, result); err != nil {
   214  						b.Fatal(err)
   215  					}
   216  					if result.Width() != 1 {
   217  						b.Fatal("expected one column")
   218  					}
   219  					if result.Length() != coldata.BatchSize() {
   220  						b.Fatal("unexpected number of elements")
   221  					}
   222  				}
   223  			})
   224  		}
   225  	}
   226  }