github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/col/colserde/arrowbatchconverter_test.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package colserde_test

import (
	"bytes"
	"fmt"
	"testing"

	"github.com/apache/arrow/go/arrow/array"
	"github.com/cockroachdb/cockroach/pkg/col/coldata"
	"github.com/cockroachdb/cockroach/pkg/col/coldatatestutils"
	"github.com/cockroachdb/cockroach/pkg/col/colserde"
	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/randutil"
	"github.com/stretchr/testify/require"
)

func randomBatch(allocator *colmem.Allocator) ([]*types.T, coldata.Batch) {
	const maxTyps = 16
	rng, _ := randutil.NewPseudoRand()

	typs := make([]*types.T, rng.Intn(maxTyps)+1)
	for i := range typs {
		typs[i] = sqlbase.RandType(rng)
	}

	capacity := rng.Intn(coldata.BatchSize()) + 1
	length := rng.Intn(capacity)
	b := coldatatestutils.RandomBatch(allocator, rng, typs, capacity, length, rng.Float64())
	return typs, b
}

func TestArrowBatchConverterRandom(t *testing.T) {
	defer leaktest.AfterTest(t)()

	typs, b := randomBatch(testAllocator)
	c, err := colserde.NewArrowBatchConverter(typs)
	require.NoError(t, err)

	// Make a copy of the original batch because the converter modifies and casts
	// data without copying for performance reasons.
	expected := coldatatestutils.CopyBatch(b, typs, testColumnFactory)

	arrowData, err := c.BatchToArrow(b)
	require.NoError(t, err)
	actual := testAllocator.NewMemBatchWithSize(typs, b.Length())
	require.NoError(t, c.ArrowToBatch(arrowData, actual))

	coldata.AssertEquivalentBatches(t, expected, actual)
}
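// The test below is not part of the upstream file: it is a hedged sketch of a
// deterministic counterpart to TestArrowBatchConverterRandom that round trips
// a small hand-built Int column instead of a random batch. The test name and
// the hand-built values are illustrative assumptions; only identifiers that
// already appear in this file are reused.
func TestArrowBatchConverterDeterministic(t *testing.T) {
	defer leaktest.AfterTest(t)()

	typs := []*types.T{types.Int}
	const n = 4
	b := testAllocator.NewMemBatchWithSize(typs, n)
	col := b.ColVec(0).Int64()
	for i := 0; i < n; i++ {
		col[i] = int64(i)
	}
	b.SetLength(n)

	c, err := colserde.NewArrowBatchConverter(typs)
	require.NoError(t, err)

	// As in the random test above, copy the batch first because the converter
	// modifies the data in place.
	expected := coldatatestutils.CopyBatch(b, typs, testColumnFactory)

	arrowData, err := c.BatchToArrow(b)
	require.NoError(t, err)
	actual := testAllocator.NewMemBatchWithSize(typs, n)
	require.NoError(t, c.ArrowToBatch(arrowData, actual))

	coldata.AssertEquivalentBatches(t, expected, actual)
}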
// roundTripBatch is a helper function that round trips a batch through the
// ArrowBatchConverter and RecordBatchSerializer and returns the deserialized
// batch. The converter modifies the input batch in place, so make sure to copy
// the input batch before passing it to this function if you want to assert
// equality against the original.
func roundTripBatch(
	b coldata.Batch,
	c *colserde.ArrowBatchConverter,
	r *colserde.RecordBatchSerializer,
	typs []*types.T,
) (coldata.Batch, error) {
	var buf bytes.Buffer
	arrowDataIn, err := c.BatchToArrow(b)
	if err != nil {
		return nil, err
	}
	_, _, err = r.Serialize(&buf, arrowDataIn)
	if err != nil {
		return nil, err
	}

	var arrowDataOut []*array.Data
	if err := r.Deserialize(&arrowDataOut, buf.Bytes()); err != nil {
		return nil, err
	}
	actual := testAllocator.NewMemBatchWithSize(typs, b.Length())
	if err := c.ArrowToBatch(arrowDataOut, actual); err != nil {
		return nil, err
	}
	return actual, nil
}

func TestRecordBatchRoundtripThroughBytes(t *testing.T) {
	defer leaktest.AfterTest(t)()

	for run := 0; run < 10; run++ {
		typs, b := randomBatch(testAllocator)
		c, err := colserde.NewArrowBatchConverter(typs)
		require.NoError(t, err)
		r, err := colserde.NewRecordBatchSerializer(typs)
		require.NoError(t, err)

		// Make a copy of the original batch because the converter modifies and
		// casts data without copying for performance reasons.
		expected := coldatatestutils.CopyBatch(b, typs, testColumnFactory)
		actual, err := roundTripBatch(b, c, r, typs)
		require.NoError(t, err)

		coldata.AssertEquivalentBatches(t, expected, actual)
	}
}

func BenchmarkArrowBatchConverter(b *testing.B) {
	// fixedLen specifies how many bytes we should fit variable-length data types
	// to in order to reduce benchmark noise.
	const fixedLen = 64

	rng, _ := randutil.NewPseudoRand()

	typs := []*types.T{
		types.Bool,
		types.Bytes,
		types.Decimal,
		types.Int,
		types.Timestamp,
	}
	// numBytes corresponds 1:1 to typs and specifies how many bytes we are
	// converting on one iteration of the benchmark for the corresponding type in
	// typs.
	numBytes := []int64{
		int64(coldata.BatchSize()),
		fixedLen * int64(coldata.BatchSize()),
		0, // The number of bytes for decimals will be set below.
		8 * int64(coldata.BatchSize()),
		3 * 8 * int64(coldata.BatchSize()),
	}
	// Run a benchmark on every type we care about.
	for typIdx, typ := range typs {
		batch := coldatatestutils.RandomBatch(testAllocator, rng, []*types.T{typ}, coldata.BatchSize(), 0 /* length */, 0 /* nullProbability */)
		if batch.Width() != 1 {
			b.Fatalf("unexpected batch width: %d", batch.Width())
		}
		if typ.Identical(types.Bytes) {
			// This type has variable-length elements; fit all of them to be fixedLen
			// bytes long so that we can compare the results of one benchmark with
			// another. Since we can't overwrite elements in a Bytes, create a new
			// one.
			// TODO(asubiotto): We should probably create some random spec struct that
			// we pass in to RandomBatch.
			bytes := batch.ColVec(0).Bytes()
			newBytes := coldata.NewBytes(bytes.Len())
			for i := 0; i < bytes.Len(); i++ {
				diff := len(bytes.Get(i)) - fixedLen
				if diff < 0 {
					newBytes.Set(i, append(bytes.Get(i), make([]byte, -diff)...))
				} else {
					newBytes.Set(i, bytes.Get(i)[:fixedLen])
				}
			}
			batch.ColVec(0).SetCol(newBytes)
		} else if typ.Identical(types.Decimal) {
			// Decimal is a variable-length type, so we want to calculate precisely
			// the total size of all decimals in the vector.
			decimals := batch.ColVec(0).Decimal()
			for _, d := range decimals {
				marshaled, err := d.MarshalText()
				require.NoError(b, err)
				numBytes[typIdx] += int64(len(marshaled))
			}
		}
		c, err := colserde.NewArrowBatchConverter([]*types.T{typ})
		require.NoError(b, err)
		nullFractions := []float64{0, 0.25, 0.5}
		setNullFraction := func(batch coldata.Batch, nullFraction float64) {
			vec := batch.ColVec(0)
			vec.Nulls().UnsetNulls()
			numNulls := int(nullFraction * float64(batch.Length()))
			// Set the first numNulls elements to null.
			for i := 0; i < batch.Length() && i < numNulls; i++ {
				vec.Nulls().SetNull(i)
			}
		}
		for _, nullFraction := range nullFractions {
			setNullFraction(batch, nullFraction)
			testPrefix := fmt.Sprintf("%s/nullFraction=%0.2f", typ.String(), nullFraction)
			var data []*array.Data
			b.Run(testPrefix+"/BatchToArrow", func(b *testing.B) {
				b.SetBytes(numBytes[typIdx])
				for i := 0; i < b.N; i++ {
					data, _ = c.BatchToArrow(batch)
					if len(data) != 1 {
						b.Fatal("expected arrow batch of length 1")
					}
					if data[0].Len() != coldata.BatchSize() {
						b.Fatal("unexpected number of elements")
					}
				}
			})
		}
		for _, nullFraction := range nullFractions {
			setNullFraction(batch, nullFraction)
			data, err := c.BatchToArrow(batch)
			require.NoError(b, err)
			testPrefix := fmt.Sprintf("%s/nullFraction=%0.2f", typ.String(), nullFraction)
			result := testAllocator.NewMemBatch([]*types.T{typ})
			b.Run(testPrefix+"/ArrowToBatch", func(b *testing.B) {
				b.SetBytes(numBytes[typIdx])
				for i := 0; i < b.N; i++ {
					// Using require.NoError here causes large enough allocations to
					// affect the result.
					if err := c.ArrowToBatch(data, result); err != nil {
						b.Fatal(err)
					}
					if result.Width() != 1 {
						b.Fatal("expected one column")
					}
					if result.Length() != coldata.BatchSize() {
						b.Fatal("unexpected number of elements")
					}
				}
			})
		}
	}
}
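// The benchmark below is not part of the upstream file: it is a hedged sketch
// of how the full BatchToArrow -> Serialize -> Deserialize -> ArrowToBatch
// pipeline exercised by roundTripBatch above could be benchmarked end to end,
// complementing BenchmarkArrowBatchConverter, which times only the conversion
// steps. The benchmark name and the choice of a single Int column are
// illustrative assumptions; only identifiers that already appear in this file
// are reused.
func BenchmarkRecordBatchRoundtripThroughBytes(b *testing.B) {
	rng, _ := randutil.NewPseudoRand()
	typs := []*types.T{types.Int}
	batch := coldatatestutils.RandomBatch(testAllocator, rng, typs, coldata.BatchSize(), 0 /* length */, 0 /* nullProbability */)

	c, err := colserde.NewArrowBatchConverter(typs)
	require.NoError(b, err)
	r, err := colserde.NewRecordBatchSerializer(typs)
	require.NoError(b, err)
	result := testAllocator.NewMemBatchWithSize(typs, batch.Length())

	// Do one untimed round trip to size the buffer and report throughput.
	var buf bytes.Buffer
	arrowData, err := c.BatchToArrow(batch)
	require.NoError(b, err)
	_, _, err = r.Serialize(&buf, arrowData)
	require.NoError(b, err)
	b.SetBytes(int64(buf.Len()))

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		buf.Reset()
		arrowData, err := c.BatchToArrow(batch)
		if err != nil {
			b.Fatal(err)
		}
		if _, _, err := r.Serialize(&buf, arrowData); err != nil {
			b.Fatal(err)
		}
		var arrowDataOut []*array.Data
		if err := r.Deserialize(&arrowDataOut, buf.Bytes()); err != nil {
			b.Fatal(err)
		}
		if err := c.ArrowToBatch(arrowDataOut, result); err != nil {
			b.Fatal(err)
		}
	}
}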