github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/col/colserde/record_batch_test.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package colserde_test 12 13 import ( 14 "bytes" 15 "encoding/binary" 16 "fmt" 17 "math" 18 "math/rand" 19 "strings" 20 "testing" 21 "time" 22 "unsafe" 23 24 "github.com/apache/arrow/go/arrow" 25 "github.com/apache/arrow/go/arrow/array" 26 "github.com/apache/arrow/go/arrow/memory" 27 "github.com/cockroachdb/apd" 28 "github.com/cockroachdb/cockroach/pkg/col/colserde" 29 "github.com/cockroachdb/cockroach/pkg/col/typeconv" 30 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 31 "github.com/cockroachdb/cockroach/pkg/sql/types" 32 "github.com/cockroachdb/cockroach/pkg/testutils" 33 "github.com/cockroachdb/cockroach/pkg/util/encoding" 34 "github.com/cockroachdb/cockroach/pkg/util/leaktest" 35 "github.com/cockroachdb/cockroach/pkg/util/randutil" 36 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 37 "github.com/stretchr/testify/require" 38 ) 39 40 // randomDataFromType creates an *array.Data of length n and type t, filling it 41 // with random values and inserting nulls with probability nullProbability. 42 func randomDataFromType(rng *rand.Rand, t *types.T, n int, nullProbability float64) *array.Data { 43 if nullProbability < 0 || nullProbability > 1 { 44 panic(fmt.Sprintf("expected a value between 0 and 1 for nullProbability but got %f", nullProbability)) 45 } 46 const ( 47 // maxVarLen is the maximum length we allow variable length datatypes (e.g. 48 // strings) to be. 49 maxVarLen = 1024 50 charset = "㪊㪋㪌㪍㪎𢽙啟敍敎敏敚敐救敒敓敔敕敖敗敘教敏敖abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ😈💜╯‵Д′)╯彡┻━┻" 51 ) 52 // valid represents the null bitmap. 53 valid := make([]bool, n) 54 for i := range valid { 55 if rng.Float64() >= nullProbability { 56 valid[i] = true 57 } 58 } 59 60 var builder array.Builder 61 switch typeconv.TypeFamilyToCanonicalTypeFamily(t.Family()) { 62 case types.BoolFamily: 63 builder = array.NewBooleanBuilder(memory.DefaultAllocator) 64 data := make([]bool, n) 65 for i := range data { 66 if rng.Float64() < 0.5 { 67 data[i] = true 68 } 69 } 70 builder.(*array.BooleanBuilder).AppendValues(data, valid) 71 case types.IntFamily: 72 switch t.Width() { 73 case 16: 74 builder = array.NewInt16Builder(memory.DefaultAllocator) 75 data := make([]int16, n) 76 for i := range data { 77 data[i] = int16(rng.Uint64()) 78 } 79 builder.(*array.Int16Builder).AppendValues(data, valid) 80 case 32: 81 builder = array.NewInt32Builder(memory.DefaultAllocator) 82 data := make([]int32, n) 83 for i := range data { 84 data[i] = int32(rng.Uint64()) 85 } 86 builder.(*array.Int32Builder).AppendValues(data, valid) 87 case 0, 64: 88 builder = array.NewInt64Builder(memory.DefaultAllocator) 89 data := make([]int64, n) 90 for i := range data { 91 data[i] = int64(rng.Uint64()) 92 } 93 builder.(*array.Int64Builder).AppendValues(data, valid) 94 default: 95 panic(fmt.Sprintf("unexpected int width: %d", t.Width())) 96 } 97 case types.FloatFamily: 98 builder = array.NewFloat64Builder(memory.DefaultAllocator) 99 data := make([]float64, n) 100 for i := range data { 101 data[i] = rng.Float64() * math.MaxFloat64 102 } 103 builder.(*array.Float64Builder).AppendValues(data, valid) 104 case types.BytesFamily: 105 // Bytes can be represented 3 different ways. As variable-length bytes, 106 // variable-length strings, or fixed-width bytes. 107 representation := rng.Intn(2) 108 switch representation { 109 case 0: 110 builder = array.NewStringBuilder(memory.DefaultAllocator) 111 data := make([]string, n) 112 stringBuilder := &strings.Builder{} 113 for i := range data { 114 stringBuilder.Reset() 115 if valid[i] { 116 for j := 0; j < rng.Intn(maxVarLen)+1; j++ { 117 stringBuilder.WriteRune(rune(charset[rng.Intn(len(charset))])) 118 } 119 } 120 data[i] = stringBuilder.String() 121 } 122 builder.(*array.StringBuilder).AppendValues(data, valid) 123 case 1: 124 builder = array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary) 125 data := make([][]byte, n) 126 for i := range data { 127 slice := make([]byte, rng.Intn(maxVarLen)) 128 if valid[i] { 129 // Read always returns len(slice) and nil error. 130 _, _ = rng.Read(slice) 131 } 132 data[i] = slice 133 } 134 builder.(*array.BinaryBuilder).AppendValues(data, valid) 135 case 2: 136 // NOTE: We currently do not generate fixed-width bytes in this test due to 137 // the different buffer layout (no offsets). The serialization code assumes 138 // 3 buffers for all types.BytesFamily types. 139 /* 140 width := rng.Intn(maxVarLen) + 1 141 builder = array.NewFixedSizeBinaryBuilder(memory.DefaultAllocator, &arrow.FixedSizeBinaryType{ByteWidth: width}) 142 data := make([][]byte, n) 143 for i := range data { 144 slice := make([]byte, width) 145 if valid[i] { 146 _, _ = rng.Read(slice) 147 } 148 data[i] = slice 149 } 150 builder.(*array.FixedSizeBinaryBuilder).AppendValues(data, valid) 151 */ 152 } 153 case types.DecimalFamily: 154 var err error 155 builder = array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary) 156 data := make([][]byte, n) 157 for i := range data { 158 var d apd.Decimal 159 // int64(rng.Uint64()) to get negative numbers, too. 160 d.SetFinite(int64(rng.Uint64()), int32(rng.Intn(40)-20)) 161 data[i], err = d.MarshalText() 162 if err != nil { 163 panic(err) 164 } 165 } 166 builder.(*array.BinaryBuilder).AppendValues(data, valid) 167 case types.TimestampTZFamily: 168 var err error 169 now := timeutil.Now() 170 builder = array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary) 171 data := make([][]byte, n) 172 for i := range data { 173 delta := rng.Int63() 174 ts := now.Add(time.Duration(delta)) 175 data[i], err = ts.MarshalBinary() 176 if err != nil { 177 panic(err) 178 } 179 } 180 builder.(*array.BinaryBuilder).AppendValues(data, valid) 181 case types.IntervalFamily: 182 builder = array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary) 183 data := make([][]byte, n) 184 sizeOfInt64 := int(unsafe.Sizeof(int64(0))) 185 for i := range data { 186 data[i] = make([]byte, sizeOfInt64*3) 187 binary.LittleEndian.PutUint64(data[i][0:sizeOfInt64], rng.Uint64()) 188 binary.LittleEndian.PutUint64(data[i][sizeOfInt64:sizeOfInt64*2], rng.Uint64()) 189 binary.LittleEndian.PutUint64(data[i][sizeOfInt64*2:sizeOfInt64*3], rng.Uint64()) 190 } 191 builder.(*array.BinaryBuilder).AppendValues(data, valid) 192 case typeconv.DatumVecCanonicalTypeFamily: 193 builder = array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary) 194 data := make([][]byte, n) 195 var ( 196 scratch []byte 197 err error 198 ) 199 for i := range data { 200 d := sqlbase.RandDatum(rng, t, false /* nullOk */) 201 data[i], err = sqlbase.EncodeTableValue(data[i], sqlbase.ColumnID(encoding.NoColumnID), d, scratch) 202 if err != nil { 203 panic(err) 204 } 205 } 206 builder.(*array.BinaryBuilder).AppendValues(data, valid) 207 default: 208 panic(fmt.Sprintf("unsupported type %s", t)) 209 } 210 return builder.NewArray().Data() 211 } 212 213 func TestRecordBatchSerializer(t *testing.T) { 214 defer leaktest.AfterTest(t)() 215 216 t.Run("UnsupportedSchema", func(t *testing.T) { 217 _, err := colserde.NewRecordBatchSerializer([]*types.T{}) 218 require.True(t, testutils.IsError(err, "zero length"), err) 219 }) 220 221 // Serializing and Deserializing an invalid schema is undefined. 222 223 t.Run("SerializeDifferentColumnLengths", func(t *testing.T) { 224 s, err := colserde.NewRecordBatchSerializer([]*types.T{types.Int, types.Int}) 225 require.NoError(t, err) 226 b := array.NewInt64Builder(memory.DefaultAllocator) 227 b.AppendValues([]int64{1, 2}, nil /* valid */) 228 firstCol := b.NewArray().Data() 229 b.AppendValues([]int64{3}, nil /* valid */) 230 secondCol := b.NewArray().Data() 231 _, _, err = s.Serialize(&bytes.Buffer{}, []*array.Data{firstCol, secondCol}) 232 require.True(t, testutils.IsError(err, "mismatched data lengths"), err) 233 }) 234 } 235 236 func TestRecordBatchSerializerSerializeDeserializeRandom(t *testing.T) { 237 defer leaktest.AfterTest(t)() 238 239 rng, _ := randutil.NewPseudoRand() 240 241 const ( 242 maxTypes = 16 243 maxDataLen = 2048 244 ) 245 246 var ( 247 typs = make([]*types.T, rng.Intn(maxTypes)+1) 248 data = make([]*array.Data, len(typs)) 249 dataLen = rng.Intn(maxDataLen) + 1 250 nullProbability = rng.Float64() 251 buf = bytes.Buffer{} 252 ) 253 254 for i := range typs { 255 typs[i] = sqlbase.RandType(rng) 256 data[i] = randomDataFromType(rng, typs[i], dataLen, nullProbability) 257 } 258 259 s, err := colserde.NewRecordBatchSerializer(typs) 260 if err != nil { 261 t.Fatal(err) 262 } 263 264 // Run Serialize/Deserialize in a loop to test reuse. 265 for i := 0; i < 2; i++ { 266 buf.Reset() 267 _, _, err := s.Serialize(&buf, data) 268 require.NoError(t, err) 269 if buf.Len()%8 != 0 { 270 t.Fatal("message length must align to 8 byte boundary") 271 } 272 var deserializedData []*array.Data 273 require.NoError(t, s.Deserialize(&deserializedData, buf.Bytes())) 274 275 // Check the fields we care most about. We can't use require.Equal directly 276 // due to some unimportant differences (e.g. mutability of underlying 277 // buffers). 278 require.Equal(t, len(data), len(deserializedData)) 279 for i := range data { 280 require.Equal(t, data[i].Len(), deserializedData[i].Len()) 281 require.Equal(t, len(data[i].Buffers()), len(deserializedData[i].Buffers())) 282 require.Equal(t, data[i].NullN(), deserializedData[i].NullN()) 283 require.Equal(t, data[i].Offset(), deserializedData[i].Offset()) 284 decBuffers := deserializedData[i].Buffers() 285 for j, buf := range data[i].Buffers() { 286 if buf == nil { 287 if decBuffers[j].Len() != 0 { 288 t.Fatal("expected zero length serialization of nil buffer") 289 } 290 continue 291 } 292 require.Equal(t, buf.Len(), decBuffers[j].Len()) 293 require.Equal(t, buf.Bytes(), decBuffers[j].Bytes()) 294 } 295 } 296 } 297 } 298 299 func BenchmarkRecordBatchSerializerInt64(b *testing.B) { 300 rng, _ := randutil.NewPseudoRand() 301 302 var ( 303 typs = []*types.T{types.Int} 304 buf = bytes.Buffer{} 305 deserializedData []*array.Data 306 ) 307 308 s, err := colserde.NewRecordBatchSerializer(typs) 309 require.NoError(b, err) 310 311 for _, dataLen := range []int{1, 16, 256, 2048, 4096} { 312 // Only calculate useful bytes. 313 numBytes := int64(dataLen * 8) 314 data := []*array.Data{randomDataFromType(rng, typs[0], dataLen, 0 /* nullProbability */)} 315 b.Run(fmt.Sprintf("Serialize/dataLen=%d", dataLen), func(b *testing.B) { 316 b.SetBytes(numBytes) 317 for i := 0; i < b.N; i++ { 318 buf.Reset() 319 if _, _, err := s.Serialize(&buf, data); err != nil { 320 b.Fatal(err) 321 } 322 } 323 }) 324 325 // buf should still have the result of the last serialization. It is still 326 // empty in cases in which we run only the Deserialize benchmarks. 327 if buf.Len() == 0 { 328 if _, _, err := s.Serialize(&buf, data); err != nil { 329 b.Fatal(err) 330 } 331 } 332 333 b.Run(fmt.Sprintf("Deserialize/dataLen=%d", dataLen), func(b *testing.B) { 334 b.SetBytes(numBytes) 335 for i := 0; i < b.N; i++ { 336 if err := s.Deserialize(&deserializedData, buf.Bytes()); err != nil { 337 b.Fatal(err) 338 } 339 deserializedData = deserializedData[:0] 340 } 341 }) 342 } 343 }