github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/testutils/random.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 // Package testutils contains utilities for generating random data and other 18 // helpers that are used for testing the various aspects of the parquet library. 19 package testutils 20 21 import ( 22 "encoding/binary" 23 "math" 24 "time" 25 "unsafe" 26 27 "github.com/apache/arrow/go/v7/arrow" 28 "github.com/apache/arrow/go/v7/arrow/array" 29 "github.com/apache/arrow/go/v7/arrow/bitutil" 30 "github.com/apache/arrow/go/v7/arrow/endian" 31 "github.com/apache/arrow/go/v7/arrow/memory" 32 "github.com/apache/arrow/go/v7/parquet" 33 "github.com/apache/arrow/go/v7/parquet/pqarrow" 34 35 "golang.org/x/exp/rand" 36 "gonum.org/v1/gonum/stat/distuv" 37 ) 38 39 // RandomArrayGenerator is a struct used for constructing Random Arrow arrays 40 // for use with testing. 41 type RandomArrayGenerator struct { 42 seed uint64 43 extra uint64 44 src rand.Source 45 seedRand *rand.Rand 46 } 47 48 // NewRandomArrayGenerator constructs a new generator with the requested Seed 49 func NewRandomArrayGenerator(seed uint64) RandomArrayGenerator { 50 src := rand.NewSource(seed) 51 return RandomArrayGenerator{seed, 0, src, rand.New(src)} 52 } 53 54 // GenerateBitmap generates a bitmap of n bits and stores it into buffer. Prob is the probability 55 // that a given bit will be zero, with 1-prob being the probability it will be 1. The return value 56 // is the number of bits that were left unset. The assumption being that buffer is currently 57 // zero initialized as this function does not clear any bits, it only sets 1s. 58 func (r *RandomArrayGenerator) GenerateBitmap(buffer []byte, n int64, prob float64) int64 { 59 count := int64(0) 60 r.extra++ 61 62 // bernoulli distribution uses P to determine the probabitiliy of a 0 or a 1, 63 // which we'll use to generate the bitmap. 64 dist := distuv.Bernoulli{P: prob, Src: rand.NewSource(r.seed + r.extra)} 65 for i := 0; int64(i) < n; i++ { 66 if dist.Rand() != float64(0.0) { 67 bitutil.SetBit(buffer, i) 68 } else { 69 count++ 70 } 71 } 72 73 return count 74 } 75 76 // ByteArray creates an array.String for use of creating random ByteArray values for testing parquet 77 // writing/reading. minLen/maxLen are the min and max length for a given value in the resulting array, 78 // with nullProb being the probability of a given index being null. 79 // 80 // For this generation we only generate ascii values with a min of 'A' and max of 'z'. 81 func (r *RandomArrayGenerator) ByteArray(size int64, minLen, maxLen int32, nullProb float64) arrow.Array { 82 if nullProb < 0 || nullProb > 1 { 83 panic("null prob must be between 0 and 1") 84 } 85 86 lengths := r.Int32(size, minLen, maxLen, nullProb) 87 defer lengths.Release() 88 89 r.extra++ 90 dist := rand.New(rand.NewSource(r.seed + r.extra)) 91 bldr := array.NewStringBuilder(memory.DefaultAllocator) 92 defer bldr.Release() 93 94 strbuf := make([]byte, maxLen) 95 96 for i := 0; int64(i) < size; i++ { 97 if lengths.IsValid(i) { 98 l := lengths.Value(i) 99 for j := int32(0); j < l; j++ { 100 strbuf[j] = byte(dist.Int31n(int32('z')-int32('A')+1) + int32('A')) 101 } 102 val := strbuf[:l] 103 bldr.Append(*(*string)(unsafe.Pointer(&val))) 104 } else { 105 bldr.AppendNull() 106 } 107 } 108 109 return bldr.NewArray() 110 } 111 112 // Uint8 generates a random array.Uint8 of the requested size whose values are between min and max 113 // with prob as the probability that a given index will be null. 114 func (r *RandomArrayGenerator) Uint8(size int64, min, max uint8, prob float64) arrow.Array { 115 buffers := make([]*memory.Buffer, 2) 116 nullCount := int64(0) 117 118 buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator) 119 buffers[0].Resize(int(bitutil.BytesForBits(size))) 120 nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, prob) 121 122 buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator) 123 buffers[1].Resize(int(size * int64(arrow.Uint8SizeBytes))) 124 125 r.extra++ 126 dist := rand.New(rand.NewSource(r.seed + r.extra)) 127 out := arrow.Uint8Traits.CastFromBytes(buffers[1].Bytes()) 128 for i := int64(0); i < size; i++ { 129 out[i] = uint8(dist.Intn(int(max-min+1))) + min 130 } 131 132 return array.NewUint8Data(array.NewData(arrow.PrimitiveTypes.Uint8, int(size), buffers, nil, int(nullCount), 0)) 133 } 134 135 // Int32 generates a random array.Int32 of the given size with each value between min and max, 136 // and pctNull as the probability that a given index will be null. 137 func (r *RandomArrayGenerator) Int32(size int64, min, max int32, pctNull float64) *array.Int32 { 138 buffers := make([]*memory.Buffer, 2) 139 nullCount := int64(0) 140 141 buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator) 142 buffers[0].Resize(int(bitutil.BytesForBits(size))) 143 nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, 1-pctNull) 144 145 buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator) 146 buffers[1].Resize(arrow.Int32Traits.BytesRequired(int(size))) 147 148 r.extra++ 149 dist := rand.New(rand.NewSource(r.seed + r.extra)) 150 out := arrow.Int32Traits.CastFromBytes(buffers[1].Bytes()) 151 for i := int64(0); i < size; i++ { 152 out[i] = dist.Int31n(max-min+1) + min 153 } 154 return array.NewInt32Data(array.NewData(arrow.PrimitiveTypes.Int32, int(size), buffers, nil, int(nullCount), 0)) 155 } 156 157 // Float64 generates a random array.Float64 of the requested size with pctNull as the probability 158 // that a given index will be null. 159 func (r *RandomArrayGenerator) Float64(size int64, pctNull float64) *array.Float64 { 160 buffers := make([]*memory.Buffer, 2) 161 nullCount := int64(0) 162 163 buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator) 164 buffers[0].Resize(int(bitutil.BytesForBits(size))) 165 nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, 1-pctNull) 166 167 buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator) 168 buffers[1].Resize(arrow.Float64Traits.BytesRequired(int(size))) 169 170 r.extra++ 171 dist := rand.New(rand.NewSource(r.seed + r.extra)) 172 out := arrow.Float64Traits.CastFromBytes(buffers[1].Bytes()) 173 for i := int64(0); i < size; i++ { 174 out[i] = dist.NormFloat64() 175 } 176 return array.NewFloat64Data(array.NewData(arrow.PrimitiveTypes.Float64, int(size), buffers, nil, int(nullCount), 0)) 177 } 178 179 // FillRandomInt8 populates the slice out with random int8 values between min and max using 180 // seed as the random see for generation to allow consistency for testing. 181 func FillRandomInt8(seed uint64, min, max int8, out []int8) { 182 r := rand.New(rand.NewSource(seed)) 183 for idx := range out { 184 out[idx] = int8(r.Intn(int(max-min+1))) + min 185 } 186 } 187 188 // FillRandomUint8 populates the slice out with random uint8 values between min and max using 189 // seed as the random see for generation to allow consistency for testing. 190 func FillRandomUint8(seed uint64, min, max uint8, out []uint8) { 191 r := rand.New(rand.NewSource(seed)) 192 for idx := range out { 193 out[idx] = uint8(r.Intn(int(max-min+1))) + min 194 } 195 } 196 197 // FillRandomInt16 populates the slice out with random int16 values between min and max using 198 // seed as the random see for generation to allow consistency for testing. 199 func FillRandomInt16(seed uint64, min, max int16, out []int16) { 200 r := rand.New(rand.NewSource(seed)) 201 for idx := range out { 202 out[idx] = int16(r.Intn(int(max-min+1))) + min 203 } 204 } 205 206 // FillRandomUint16 populates the slice out with random uint16 values between min and max using 207 // seed as the random see for generation to allow consistency for testing. 208 func FillRandomUint16(seed uint64, min, max uint16, out []uint16) { 209 r := rand.New(rand.NewSource(seed)) 210 for idx := range out { 211 out[idx] = uint16(r.Intn(int(max-min+1))) + min 212 } 213 } 214 215 // FillRandomInt32 populates out with random int32 values using seed as the random 216 // seed for the generator to allow consistency for testing. 217 func FillRandomInt32(seed uint64, out []int32) { 218 r := rand.New(rand.NewSource(seed)) 219 for idx := range out { 220 out[idx] = int32(r.Uint32()) 221 } 222 } 223 224 // FillRandomInt32Max populates out with random int32 values between 0 and max using seed as the random 225 // seed for the generator to allow consistency for testing. 226 func FillRandomInt32Max(seed uint64, max int32, out []int32) { 227 r := rand.New(rand.NewSource(seed)) 228 for idx := range out { 229 out[idx] = r.Int31n(max) 230 } 231 } 232 233 // FillRandomUint32Max populates out with random uint32 values between 0 and max using seed as the random 234 // seed for the generator to allow consistency for testing. 235 func FillRandomUint32Max(seed uint64, max uint32, out []uint32) { 236 r := rand.New(rand.NewSource(seed)) 237 for idx := range out { 238 out[idx] = uint32(r.Uint64n(uint64(max))) 239 } 240 } 241 242 // FillRandomInt64Max populates out with random int64 values between 0 and max using seed as the random 243 // seed for the generator to allow consistency for testing. 244 func FillRandomInt64Max(seed uint64, max int64, out []int64) { 245 r := rand.New(rand.NewSource(seed)) 246 for idx := range out { 247 out[idx] = r.Int63n(max) 248 } 249 } 250 251 // FillRandomUint32 populates out with random uint32 values using seed as the random 252 // seed for the generator to allow consistency for testing. 253 func FillRandomUint32(seed uint64, out []uint32) { 254 r := rand.New(rand.NewSource(seed)) 255 for idx := range out { 256 out[idx] = r.Uint32() 257 } 258 } 259 260 // FillRandomUint64 populates out with random uint64 values using seed as the random 261 // seed for the generator to allow consistency for testing. 262 func FillRandomUint64(seed uint64, out []uint64) { 263 r := rand.New(rand.NewSource(seed)) 264 for idx := range out { 265 out[idx] = r.Uint64() 266 } 267 } 268 269 // FillRandomUint64Max populates out with random uint64 values between 0 and max using seed as the random 270 // seed for the generator to allow consistency for testing. 271 func FillRandomUint64Max(seed uint64, max uint64, out []uint64) { 272 r := rand.New(rand.NewSource(seed)) 273 for idx := range out { 274 out[idx] = r.Uint64n(max) 275 } 276 } 277 278 // FillRandomInt64 populates out with random int64 values using seed as the random 279 // seed for the generator to allow consistency for testing. 280 func FillRandomInt64(seed uint64, out []int64) { 281 r := rand.New(rand.NewSource(seed)) 282 for idx := range out { 283 out[idx] = int64(r.Uint64()) 284 } 285 } 286 287 // FillRandomInt96 populates out with random Int96 values using seed as the random 288 // seed for the generator to allow consistency for testing. It does this by generating 289 // three random uint32 values for each int96 value. 290 func FillRandomInt96(seed uint64, out []parquet.Int96) { 291 r := rand.New(rand.NewSource(seed)) 292 for idx := range out { 293 *(*int32)(unsafe.Pointer(&out[idx][0])) = int32(r.Uint32()) 294 *(*int32)(unsafe.Pointer(&out[idx][4])) = int32(r.Uint32()) 295 *(*int32)(unsafe.Pointer(&out[idx][8])) = int32(r.Uint32()) 296 } 297 } 298 299 // randFloat32 creates a random float value with a normal distribution 300 // to better spread the values out and ensure we do not return any NaN values. 301 func randFloat32(r *rand.Rand) float32 { 302 for { 303 f := math.Float32frombits(r.Uint32()) 304 if !math.IsNaN(float64(f)) { 305 return f 306 } 307 } 308 } 309 310 // randFloat64 creates a random float value with a normal distribution 311 // to better spread the values out and ensure we do not return any NaN values. 312 func randFloat64(r *rand.Rand) float64 { 313 for { 314 f := math.Float64frombits(r.Uint64()) 315 if !math.IsNaN(f) { 316 return f 317 } 318 } 319 } 320 321 // FillRandomFloat32 populates out with random float32 values using seed as the random 322 // seed for the generator to allow consistency for testing. 323 func FillRandomFloat32(seed uint64, out []float32) { 324 r := rand.New(rand.NewSource(seed)) 325 for idx := range out { 326 out[idx] = randFloat32(r) 327 } 328 } 329 330 // FillRandomFloat64 populates out with random float64 values using seed as the random 331 // seed for the generator to allow consistency for testing. 332 func FillRandomFloat64(seed uint64, out []float64) { 333 r := rand.New(rand.NewSource(seed)) 334 for idx := range out { 335 out[idx] = randFloat64(r) 336 } 337 } 338 339 // FillRandomByteArray populates out with random ByteArray values with lengths between 2 and 12 340 // using heap as the actual memory storage used for the bytes generated. Each element of 341 // out will be some slice of the bytes in heap, and as such heap must outlive the byte array slices. 342 func FillRandomByteArray(seed uint64, out []parquet.ByteArray, heap *memory.Buffer) { 343 const ( 344 maxByteArrayLen = 12 345 minByteArrayLen = 2 346 ) 347 RandomByteArray(seed, out, heap, minByteArrayLen, maxByteArrayLen) 348 } 349 350 // FillRandomFixedByteArray populates out with random FixedLenByteArray values with of a length equal to size 351 // using heap as the actual memory storage used for the bytes generated. Each element of 352 // out will be a slice of size bytes in heap, and as such heap must outlive the byte array slices. 353 func FillRandomFixedByteArray(seed uint64, out []parquet.FixedLenByteArray, heap *memory.Buffer, size int) { 354 heap.Resize(len(out) * size) 355 356 buf := heap.Bytes() 357 r := rand.New(rand.NewSource(seed)) 358 for idx := range out { 359 r.Read(buf[:size]) 360 out[idx] = buf[:size] 361 buf = buf[size:] 362 } 363 } 364 365 // FillRandomBooleans populates out with random bools with the probability p of being false using 366 // seed as the random seed to the generator in order to allow consistency for testing. This uses 367 // a Bernoulli distribution of values. 368 func FillRandomBooleans(p float64, seed uint64, out []bool) { 369 dist := distuv.Bernoulli{P: p, Src: rand.NewSource(seed)} 370 for idx := range out { 371 out[idx] = dist.Rand() != float64(0.0) 372 } 373 } 374 375 // fillRandomIsValid populates out with random bools with the probability pctNull of being false using 376 // seed as the random seed to the generator in order to allow consistency for testing. This uses 377 // the default Golang random generator distribution of float64 values between 0 and 1 comparing against 378 // pctNull. If the random value is > pctNull, it is true. 379 func fillRandomIsValid(seed uint64, pctNull float64, out []bool) { 380 r := rand.New(rand.NewSource(seed)) 381 for idx := range out { 382 out[idx] = r.Float64() > pctNull 383 } 384 } 385 386 // InitValues is a convenience function for generating a slice of random values based on the type. 387 // If the type is parquet.ByteArray or parquet.FixedLenByteArray, heap must not be null. 388 // 389 // The default values are: 390 // []bool uses the current time as the seed with only values of 1 being false, for use 391 // of creating validity boolean slices. 392 // all other types use 0 as the seed 393 // a []parquet.ByteArray is populated with lengths between 2 and 12 394 // a []parquet.FixedLenByteArray is populated with fixed size random byte arrays of length 12. 395 func InitValues(values interface{}, heap *memory.Buffer) { 396 switch arr := values.(type) { 397 case []bool: 398 fillRandomIsValid(uint64(time.Now().Unix()), 1.0, arr) 399 case []int32: 400 FillRandomInt32(0, arr) 401 case []int64: 402 FillRandomInt64(0, arr) 403 case []float32: 404 FillRandomFloat32(0, arr) 405 case []float64: 406 FillRandomFloat64(0, arr) 407 case []parquet.Int96: 408 FillRandomInt96(0, arr) 409 case []parquet.ByteArray: 410 FillRandomByteArray(0, arr, heap) 411 case []parquet.FixedLenByteArray: 412 FillRandomFixedByteArray(0, arr, heap, 12) 413 } 414 } 415 416 // RandomByteArray populates out with random ByteArray values with lengths between minlen and maxlen 417 // using heap as the actual memory storage used for the bytes generated. Each element of 418 // out will be some slice of the bytes in heap, and as such heap must outlive the byte array slices. 419 func RandomByteArray(seed uint64, out []parquet.ByteArray, heap *memory.Buffer, minlen, maxlen int) { 420 heap.Resize(len(out) * (maxlen + arrow.Uint32SizeBytes)) 421 422 buf := heap.Bytes() 423 r := rand.New(rand.NewSource(seed)) 424 for idx := range out { 425 length := r.Intn(maxlen-minlen+1) + minlen 426 r.Read(buf[:length]) 427 out[idx] = buf[:length] 428 429 buf = buf[length:] 430 } 431 } 432 433 // RandomDecimals generates n random decimal values with precision determining the byte width 434 // for the values and seed as the random generator seed to allow consistency for testing. The 435 // resulting values will be either 32 bytes or 16 bytes each depending on the precision. 436 func RandomDecimals(n int64, seed uint64, precision int32) []byte { 437 r := rand.New(rand.NewSource(seed)) 438 nreqBytes := pqarrow.DecimalSize(precision) 439 byteWidth := 32 440 if precision <= 38 { 441 byteWidth = 16 442 } 443 444 out := make([]byte, int(int64(byteWidth)*n)) 445 for i := int64(0); i < n; i++ { 446 start := int(i) * byteWidth 447 r.Read(out[start : start+int(nreqBytes)]) 448 // sign extend if the sign bit is set for the last generated byte 449 // 0b10000000 == 0x80 == 128 450 if out[start+int(nreqBytes)-1]&byte(0x80) != 0 { 451 memory.Set(out[start+int(nreqBytes):start+byteWidth], 0xFF) 452 } 453 454 // byte swap for big endian 455 if endian.IsBigEndian { 456 for j := 0; j+8 <= byteWidth; j += 8 { 457 v := binary.LittleEndian.Uint64(out[start+j : start+j+8]) 458 binary.BigEndian.PutUint64(out[start+j:start+j+8], v) 459 } 460 } 461 } 462 return out 463 }