github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/col/colserde/record_batch_test.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package colserde_test
    12  
    13  import (
    14  	"bytes"
    15  	"encoding/binary"
    16  	"fmt"
    17  	"math"
    18  	"math/rand"
    19  	"strings"
    20  	"testing"
    21  	"time"
    22  	"unsafe"
    23  
    24  	"github.com/apache/arrow/go/arrow"
    25  	"github.com/apache/arrow/go/arrow/array"
    26  	"github.com/apache/arrow/go/arrow/memory"
    27  	"github.com/cockroachdb/apd"
    28  	"github.com/cockroachdb/cockroach/pkg/col/colserde"
    29  	"github.com/cockroachdb/cockroach/pkg/col/typeconv"
    30  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    31  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    32  	"github.com/cockroachdb/cockroach/pkg/testutils"
    33  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    34  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    35  	"github.com/cockroachdb/cockroach/pkg/util/randutil"
    36  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    37  	"github.com/stretchr/testify/require"
    38  )
    39  
    40  // randomDataFromType creates an *array.Data of length n and type t, filling it
    41  // with random values and inserting nulls with probability nullProbability.
    42  func randomDataFromType(rng *rand.Rand, t *types.T, n int, nullProbability float64) *array.Data {
    43  	if nullProbability < 0 || nullProbability > 1 {
    44  		panic(fmt.Sprintf("expected a value between 0 and 1 for nullProbability but got %f", nullProbability))
    45  	}
    46  	const (
    47  		// maxVarLen is the maximum length we allow variable length datatypes (e.g.
    48  		// strings) to be.
    49  		maxVarLen = 1024
    50  		charset   = "㪊㪋㪌㪍㪎𢽙啟敍敎敏敚敐救敒敓敔敕敖敗敘教敏敖abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ😈💜╯‵Д′)╯彡┻━┻"
    51  	)
    52  	// valid represents the null bitmap.
    53  	valid := make([]bool, n)
    54  	for i := range valid {
    55  		if rng.Float64() >= nullProbability {
    56  			valid[i] = true
    57  		}
    58  	}
    59  
    60  	var builder array.Builder
    61  	switch typeconv.TypeFamilyToCanonicalTypeFamily(t.Family()) {
    62  	case types.BoolFamily:
    63  		builder = array.NewBooleanBuilder(memory.DefaultAllocator)
    64  		data := make([]bool, n)
    65  		for i := range data {
    66  			if rng.Float64() < 0.5 {
    67  				data[i] = true
    68  			}
    69  		}
    70  		builder.(*array.BooleanBuilder).AppendValues(data, valid)
    71  	case types.IntFamily:
    72  		switch t.Width() {
    73  		case 16:
    74  			builder = array.NewInt16Builder(memory.DefaultAllocator)
    75  			data := make([]int16, n)
    76  			for i := range data {
    77  				data[i] = int16(rng.Uint64())
    78  			}
    79  			builder.(*array.Int16Builder).AppendValues(data, valid)
    80  		case 32:
    81  			builder = array.NewInt32Builder(memory.DefaultAllocator)
    82  			data := make([]int32, n)
    83  			for i := range data {
    84  				data[i] = int32(rng.Uint64())
    85  			}
    86  			builder.(*array.Int32Builder).AppendValues(data, valid)
    87  		case 0, 64:
    88  			builder = array.NewInt64Builder(memory.DefaultAllocator)
    89  			data := make([]int64, n)
    90  			for i := range data {
    91  				data[i] = int64(rng.Uint64())
    92  			}
    93  			builder.(*array.Int64Builder).AppendValues(data, valid)
    94  		default:
    95  			panic(fmt.Sprintf("unexpected int width: %d", t.Width()))
    96  		}
    97  	case types.FloatFamily:
    98  		builder = array.NewFloat64Builder(memory.DefaultAllocator)
    99  		data := make([]float64, n)
   100  		for i := range data {
   101  			data[i] = rng.Float64() * math.MaxFloat64
   102  		}
   103  		builder.(*array.Float64Builder).AppendValues(data, valid)
   104  	case types.BytesFamily:
   105  		// Bytes can be represented 3 different ways. As variable-length bytes,
   106  		// variable-length strings, or fixed-width bytes.
   107  		representation := rng.Intn(2)
   108  		switch representation {
   109  		case 0:
   110  			builder = array.NewStringBuilder(memory.DefaultAllocator)
   111  			data := make([]string, n)
   112  			stringBuilder := &strings.Builder{}
   113  			for i := range data {
   114  				stringBuilder.Reset()
   115  				if valid[i] {
   116  					for j := 0; j < rng.Intn(maxVarLen)+1; j++ {
   117  						stringBuilder.WriteRune(rune(charset[rng.Intn(len(charset))]))
   118  					}
   119  				}
   120  				data[i] = stringBuilder.String()
   121  			}
   122  			builder.(*array.StringBuilder).AppendValues(data, valid)
   123  		case 1:
   124  			builder = array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary)
   125  			data := make([][]byte, n)
   126  			for i := range data {
   127  				slice := make([]byte, rng.Intn(maxVarLen))
   128  				if valid[i] {
   129  					// Read always returns len(slice) and nil error.
   130  					_, _ = rng.Read(slice)
   131  				}
   132  				data[i] = slice
   133  			}
   134  			builder.(*array.BinaryBuilder).AppendValues(data, valid)
   135  		case 2:
   136  			// NOTE: We currently do not generate fixed-width bytes in this test due to
   137  			// the different buffer layout (no offsets). The serialization code assumes
   138  			// 3 buffers for all types.BytesFamily types.
   139  			/*
   140  				width := rng.Intn(maxVarLen) + 1
   141  				  builder = array.NewFixedSizeBinaryBuilder(memory.DefaultAllocator, &arrow.FixedSizeBinaryType{ByteWidth: width})
   142  				  data := make([][]byte, n)
   143  				  for i := range data {
   144  				  	slice := make([]byte, width)
   145  				  	if valid[i] {
   146  				  		_, _ = rng.Read(slice)
   147  				  	}
   148  				  	data[i] = slice
   149  				  }
   150  				  builder.(*array.FixedSizeBinaryBuilder).AppendValues(data, valid)
   151  			*/
   152  		}
   153  	case types.DecimalFamily:
   154  		var err error
   155  		builder = array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary)
   156  		data := make([][]byte, n)
   157  		for i := range data {
   158  			var d apd.Decimal
   159  			// int64(rng.Uint64()) to get negative numbers, too.
   160  			d.SetFinite(int64(rng.Uint64()), int32(rng.Intn(40)-20))
   161  			data[i], err = d.MarshalText()
   162  			if err != nil {
   163  				panic(err)
   164  			}
   165  		}
   166  		builder.(*array.BinaryBuilder).AppendValues(data, valid)
   167  	case types.TimestampTZFamily:
   168  		var err error
   169  		now := timeutil.Now()
   170  		builder = array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary)
   171  		data := make([][]byte, n)
   172  		for i := range data {
   173  			delta := rng.Int63()
   174  			ts := now.Add(time.Duration(delta))
   175  			data[i], err = ts.MarshalBinary()
   176  			if err != nil {
   177  				panic(err)
   178  			}
   179  		}
   180  		builder.(*array.BinaryBuilder).AppendValues(data, valid)
   181  	case types.IntervalFamily:
   182  		builder = array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary)
   183  		data := make([][]byte, n)
   184  		sizeOfInt64 := int(unsafe.Sizeof(int64(0)))
   185  		for i := range data {
   186  			data[i] = make([]byte, sizeOfInt64*3)
   187  			binary.LittleEndian.PutUint64(data[i][0:sizeOfInt64], rng.Uint64())
   188  			binary.LittleEndian.PutUint64(data[i][sizeOfInt64:sizeOfInt64*2], rng.Uint64())
   189  			binary.LittleEndian.PutUint64(data[i][sizeOfInt64*2:sizeOfInt64*3], rng.Uint64())
   190  		}
   191  		builder.(*array.BinaryBuilder).AppendValues(data, valid)
   192  	case typeconv.DatumVecCanonicalTypeFamily:
   193  		builder = array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary)
   194  		data := make([][]byte, n)
   195  		var (
   196  			scratch []byte
   197  			err     error
   198  		)
   199  		for i := range data {
   200  			d := sqlbase.RandDatum(rng, t, false /* nullOk */)
   201  			data[i], err = sqlbase.EncodeTableValue(data[i], sqlbase.ColumnID(encoding.NoColumnID), d, scratch)
   202  			if err != nil {
   203  				panic(err)
   204  			}
   205  		}
   206  		builder.(*array.BinaryBuilder).AppendValues(data, valid)
   207  	default:
   208  		panic(fmt.Sprintf("unsupported type %s", t))
   209  	}
   210  	return builder.NewArray().Data()
   211  }
   212  
   213  func TestRecordBatchSerializer(t *testing.T) {
   214  	defer leaktest.AfterTest(t)()
   215  
   216  	t.Run("UnsupportedSchema", func(t *testing.T) {
   217  		_, err := colserde.NewRecordBatchSerializer([]*types.T{})
   218  		require.True(t, testutils.IsError(err, "zero length"), err)
   219  	})
   220  
   221  	// Serializing and Deserializing an invalid schema is undefined.
   222  
   223  	t.Run("SerializeDifferentColumnLengths", func(t *testing.T) {
   224  		s, err := colserde.NewRecordBatchSerializer([]*types.T{types.Int, types.Int})
   225  		require.NoError(t, err)
   226  		b := array.NewInt64Builder(memory.DefaultAllocator)
   227  		b.AppendValues([]int64{1, 2}, nil /* valid */)
   228  		firstCol := b.NewArray().Data()
   229  		b.AppendValues([]int64{3}, nil /* valid */)
   230  		secondCol := b.NewArray().Data()
   231  		_, _, err = s.Serialize(&bytes.Buffer{}, []*array.Data{firstCol, secondCol})
   232  		require.True(t, testutils.IsError(err, "mismatched data lengths"), err)
   233  	})
   234  }
   235  
   236  func TestRecordBatchSerializerSerializeDeserializeRandom(t *testing.T) {
   237  	defer leaktest.AfterTest(t)()
   238  
   239  	rng, _ := randutil.NewPseudoRand()
   240  
   241  	const (
   242  		maxTypes   = 16
   243  		maxDataLen = 2048
   244  	)
   245  
   246  	var (
   247  		typs            = make([]*types.T, rng.Intn(maxTypes)+1)
   248  		data            = make([]*array.Data, len(typs))
   249  		dataLen         = rng.Intn(maxDataLen) + 1
   250  		nullProbability = rng.Float64()
   251  		buf             = bytes.Buffer{}
   252  	)
   253  
   254  	for i := range typs {
   255  		typs[i] = sqlbase.RandType(rng)
   256  		data[i] = randomDataFromType(rng, typs[i], dataLen, nullProbability)
   257  	}
   258  
   259  	s, err := colserde.NewRecordBatchSerializer(typs)
   260  	if err != nil {
   261  		t.Fatal(err)
   262  	}
   263  
   264  	// Run Serialize/Deserialize in a loop to test reuse.
   265  	for i := 0; i < 2; i++ {
   266  		buf.Reset()
   267  		_, _, err := s.Serialize(&buf, data)
   268  		require.NoError(t, err)
   269  		if buf.Len()%8 != 0 {
   270  			t.Fatal("message length must align to 8 byte boundary")
   271  		}
   272  		var deserializedData []*array.Data
   273  		require.NoError(t, s.Deserialize(&deserializedData, buf.Bytes()))
   274  
   275  		// Check the fields we care most about. We can't use require.Equal directly
   276  		// due to some unimportant differences (e.g. mutability of underlying
   277  		// buffers).
   278  		require.Equal(t, len(data), len(deserializedData))
   279  		for i := range data {
   280  			require.Equal(t, data[i].Len(), deserializedData[i].Len())
   281  			require.Equal(t, len(data[i].Buffers()), len(deserializedData[i].Buffers()))
   282  			require.Equal(t, data[i].NullN(), deserializedData[i].NullN())
   283  			require.Equal(t, data[i].Offset(), deserializedData[i].Offset())
   284  			decBuffers := deserializedData[i].Buffers()
   285  			for j, buf := range data[i].Buffers() {
   286  				if buf == nil {
   287  					if decBuffers[j].Len() != 0 {
   288  						t.Fatal("expected zero length serialization of nil buffer")
   289  					}
   290  					continue
   291  				}
   292  				require.Equal(t, buf.Len(), decBuffers[j].Len())
   293  				require.Equal(t, buf.Bytes(), decBuffers[j].Bytes())
   294  			}
   295  		}
   296  	}
   297  }
   298  
   299  func BenchmarkRecordBatchSerializerInt64(b *testing.B) {
   300  	rng, _ := randutil.NewPseudoRand()
   301  
   302  	var (
   303  		typs             = []*types.T{types.Int}
   304  		buf              = bytes.Buffer{}
   305  		deserializedData []*array.Data
   306  	)
   307  
   308  	s, err := colserde.NewRecordBatchSerializer(typs)
   309  	require.NoError(b, err)
   310  
   311  	for _, dataLen := range []int{1, 16, 256, 2048, 4096} {
   312  		// Only calculate useful bytes.
   313  		numBytes := int64(dataLen * 8)
   314  		data := []*array.Data{randomDataFromType(rng, typs[0], dataLen, 0 /* nullProbability */)}
   315  		b.Run(fmt.Sprintf("Serialize/dataLen=%d", dataLen), func(b *testing.B) {
   316  			b.SetBytes(numBytes)
   317  			for i := 0; i < b.N; i++ {
   318  				buf.Reset()
   319  				if _, _, err := s.Serialize(&buf, data); err != nil {
   320  					b.Fatal(err)
   321  				}
   322  			}
   323  		})
   324  
   325  		// buf should still have the result of the last serialization. It is still
   326  		// empty in cases in which we run only the Deserialize benchmarks.
   327  		if buf.Len() == 0 {
   328  			if _, _, err := s.Serialize(&buf, data); err != nil {
   329  				b.Fatal(err)
   330  			}
   331  		}
   332  
   333  		b.Run(fmt.Sprintf("Deserialize/dataLen=%d", dataLen), func(b *testing.B) {
   334  			b.SetBytes(numBytes)
   335  			for i := 0; i < b.N; i++ {
   336  				if err := s.Deserialize(&deserializedData, buf.Bytes()); err != nil {
   337  					b.Fatal(err)
   338  				}
   339  				deserializedData = deserializedData[:0]
   340  			}
   341  		})
   342  	}
   343  }