github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/testutils/random.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 // Package testutils contains utilities for generating random data and other 18 // helpers that are used for testing the various aspects of the parquet library. 19 package testutils 20 21 import ( 22 "encoding/binary" 23 "math" 24 "time" 25 "unsafe" 26 27 "github.com/apache/arrow/go/v14/arrow" 28 "github.com/apache/arrow/go/v14/arrow/array" 29 "github.com/apache/arrow/go/v14/arrow/bitutil" 30 "github.com/apache/arrow/go/v14/arrow/endian" 31 "github.com/apache/arrow/go/v14/arrow/memory" 32 "github.com/apache/arrow/go/v14/parquet" 33 "github.com/apache/arrow/go/v14/parquet/pqarrow" 34 35 "golang.org/x/exp/rand" 36 "gonum.org/v1/gonum/stat/distuv" 37 ) 38 39 // RandomArrayGenerator is a struct used for constructing Random Arrow arrays 40 // for use with testing. 41 type RandomArrayGenerator struct { 42 seed uint64 43 extra uint64 44 src rand.Source 45 seedRand *rand.Rand 46 } 47 48 // NewRandomArrayGenerator constructs a new generator with the requested Seed 49 func NewRandomArrayGenerator(seed uint64) RandomArrayGenerator { 50 src := rand.NewSource(seed) 51 return RandomArrayGenerator{seed, 0, src, rand.New(src)} 52 } 53 54 // GenerateBitmap generates a bitmap of n bits and stores it into buffer. Prob is the probability 55 // that a given bit will be zero, with 1-prob being the probability it will be 1. The return value 56 // is the number of bits that were left unset. The assumption being that buffer is currently 57 // zero initialized as this function does not clear any bits, it only sets 1s. 58 func (r *RandomArrayGenerator) GenerateBitmap(buffer []byte, n int64, prob float64) int64 { 59 count := int64(0) 60 r.extra++ 61 62 // bernoulli distribution uses P to determine the probabitiliy of a 0 or a 1, 63 // which we'll use to generate the bitmap. 64 dist := distuv.Bernoulli{P: prob, Src: rand.NewSource(r.seed + r.extra)} 65 for i := 0; int64(i) < n; i++ { 66 if dist.Rand() != float64(0.0) { 67 bitutil.SetBit(buffer, i) 68 } else { 69 count++ 70 } 71 } 72 73 return count 74 } 75 76 // ByteArray creates an array.String for use of creating random ByteArray values for testing parquet 77 // writing/reading. minLen/maxLen are the min and max length for a given value in the resulting array, 78 // with nullProb being the probability of a given index being null. 79 // 80 // For this generation we only generate ascii values with a min of 'A' and max of 'z'. 81 func (r *RandomArrayGenerator) ByteArray(size int64, minLen, maxLen int32, nullProb float64) arrow.Array { 82 if nullProb < 0 || nullProb > 1 { 83 panic("null prob must be between 0 and 1") 84 } 85 86 lengths := r.Int32(size, minLen, maxLen, nullProb) 87 defer lengths.Release() 88 89 r.extra++ 90 dist := rand.New(rand.NewSource(r.seed + r.extra)) 91 bldr := array.NewStringBuilder(memory.DefaultAllocator) 92 defer bldr.Release() 93 94 strbuf := make([]byte, maxLen) 95 96 for i := 0; int64(i) < size; i++ { 97 if lengths.IsValid(i) { 98 l := lengths.Value(i) 99 for j := int32(0); j < l; j++ { 100 strbuf[j] = byte(dist.Int31n(int32('z')-int32('A')+1) + int32('A')) 101 } 102 val := strbuf[:l] 103 bldr.Append(*(*string)(unsafe.Pointer(&val))) 104 } else { 105 bldr.AppendNull() 106 } 107 } 108 109 return bldr.NewArray() 110 } 111 112 // Uint8 generates a random array.Uint8 of the requested size whose values are between min and max 113 // with prob as the probability that a given index will be null. 114 func (r *RandomArrayGenerator) Uint8(size int64, min, max uint8, prob float64) arrow.Array { 115 buffers := make([]*memory.Buffer, 2) 116 nullCount := int64(0) 117 118 buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator) 119 buffers[0].Resize(int(bitutil.BytesForBits(size))) 120 nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, prob) 121 122 buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator) 123 buffers[1].Resize(int(size * int64(arrow.Uint8SizeBytes))) 124 125 r.extra++ 126 dist := rand.New(rand.NewSource(r.seed + r.extra)) 127 out := arrow.Uint8Traits.CastFromBytes(buffers[1].Bytes()) 128 for i := int64(0); i < size; i++ { 129 out[i] = uint8(dist.Intn(int(max-min+1))) + min 130 } 131 132 return array.NewUint8Data(array.NewData(arrow.PrimitiveTypes.Uint8, int(size), buffers, nil, int(nullCount), 0)) 133 } 134 135 // Int32 generates a random array.Int32 of the given size with each value between min and max, 136 // and pctNull as the probability that a given index will be null. 137 func (r *RandomArrayGenerator) Int32(size int64, min, max int32, pctNull float64) *array.Int32 { 138 buffers := make([]*memory.Buffer, 2) 139 nullCount := int64(0) 140 141 buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator) 142 buffers[0].Resize(int(bitutil.BytesForBits(size))) 143 nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, 1-pctNull) 144 145 buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator) 146 buffers[1].Resize(arrow.Int32Traits.BytesRequired(int(size))) 147 148 r.extra++ 149 dist := rand.New(rand.NewSource(r.seed + r.extra)) 150 out := arrow.Int32Traits.CastFromBytes(buffers[1].Bytes()) 151 for i := int64(0); i < size; i++ { 152 out[i] = dist.Int31n(max-min+1) + min 153 } 154 return array.NewInt32Data(array.NewData(arrow.PrimitiveTypes.Int32, int(size), buffers, nil, int(nullCount), 0)) 155 } 156 157 // Int64 generates a random array.Int64 of the given size with each value between min and max, 158 // and pctNull as the probability that a given index will be null. 159 func (r *RandomArrayGenerator) Int64(size int64, min, max int64, pctNull float64) *array.Int64 { 160 buffers := make([]*memory.Buffer, 2) 161 nullCount := int64(0) 162 163 buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator) 164 buffers[0].Resize(int(bitutil.BytesForBits(size))) 165 nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, 1-pctNull) 166 167 buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator) 168 buffers[1].Resize(arrow.Int64Traits.BytesRequired(int(size))) 169 170 r.extra++ 171 dist := rand.New(rand.NewSource(r.seed + r.extra)) 172 out := arrow.Int64Traits.CastFromBytes(buffers[1].Bytes()) 173 for i := int64(0); i < size; i++ { 174 out[i] = dist.Int63n(max-min+1) + min 175 } 176 return array.NewInt64Data(array.NewData(arrow.PrimitiveTypes.Int64, int(size), buffers, nil, int(nullCount), 0)) 177 } 178 179 // Float64 generates a random array.Float64 of the requested size with pctNull as the probability 180 // that a given index will be null. 181 func (r *RandomArrayGenerator) Float64(size int64, pctNull float64) *array.Float64 { 182 buffers := make([]*memory.Buffer, 2) 183 nullCount := int64(0) 184 185 buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator) 186 buffers[0].Resize(int(bitutil.BytesForBits(size))) 187 nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, 1-pctNull) 188 189 buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator) 190 buffers[1].Resize(arrow.Float64Traits.BytesRequired(int(size))) 191 192 r.extra++ 193 dist := rand.New(rand.NewSource(r.seed + r.extra)) 194 out := arrow.Float64Traits.CastFromBytes(buffers[1].Bytes()) 195 for i := int64(0); i < size; i++ { 196 out[i] = dist.NormFloat64() 197 } 198 return array.NewFloat64Data(array.NewData(arrow.PrimitiveTypes.Float64, int(size), buffers, nil, int(nullCount), 0)) 199 } 200 201 func (r *RandomArrayGenerator) StringWithRepeats(mem memory.Allocator, sz, unique int64, minLen, maxLen int32, nullProb float64) *array.String { 202 if unique > sz { 203 panic("invalid config for random StringWithRepeats") 204 } 205 206 // generate a random string dictionary without any nulls 207 arr := r.ByteArray(unique, minLen, maxLen, 0) 208 defer arr.Release() 209 dict := arr.(*array.String) 210 211 // generate random indices to sample dictionary with 212 idArray := r.Int64(sz, 0, unique-1, nullProb) 213 defer idArray.Release() 214 215 bldr := array.NewStringBuilder(mem) 216 defer bldr.Release() 217 218 for i := int64(0); i < sz; i++ { 219 if idArray.IsValid(int(i)) { 220 idx := idArray.Value(int(i)) 221 bldr.Append(dict.Value(int(idx))) 222 } else { 223 bldr.AppendNull() 224 } 225 } 226 227 return bldr.NewStringArray() 228 } 229 230 // FillRandomInt8 populates the slice out with random int8 values between min and max using 231 // seed as the random see for generation to allow consistency for testing. 232 func FillRandomInt8(seed uint64, min, max int8, out []int8) { 233 r := rand.New(rand.NewSource(seed)) 234 for idx := range out { 235 out[idx] = int8(r.Intn(int(max-min+1))) + min 236 } 237 } 238 239 // FillRandomUint8 populates the slice out with random uint8 values between min and max using 240 // seed as the random see for generation to allow consistency for testing. 241 func FillRandomUint8(seed uint64, min, max uint8, out []uint8) { 242 r := rand.New(rand.NewSource(seed)) 243 for idx := range out { 244 out[idx] = uint8(r.Intn(int(max-min+1))) + min 245 } 246 } 247 248 // FillRandomInt16 populates the slice out with random int16 values between min and max using 249 // seed as the random see for generation to allow consistency for testing. 250 func FillRandomInt16(seed uint64, min, max int16, out []int16) { 251 r := rand.New(rand.NewSource(seed)) 252 for idx := range out { 253 out[idx] = int16(r.Intn(int(max-min+1))) + min 254 } 255 } 256 257 // FillRandomUint16 populates the slice out with random uint16 values between min and max using 258 // seed as the random see for generation to allow consistency for testing. 259 func FillRandomUint16(seed uint64, min, max uint16, out []uint16) { 260 r := rand.New(rand.NewSource(seed)) 261 for idx := range out { 262 out[idx] = uint16(r.Intn(int(max-min+1))) + min 263 } 264 } 265 266 // FillRandomInt32 populates out with random int32 values using seed as the random 267 // seed for the generator to allow consistency for testing. 268 func FillRandomInt32(seed uint64, out []int32) { 269 r := rand.New(rand.NewSource(seed)) 270 for idx := range out { 271 out[idx] = int32(r.Uint32()) 272 } 273 } 274 275 // FillRandomInt32Max populates out with random int32 values between 0 and max using seed as the random 276 // seed for the generator to allow consistency for testing. 277 func FillRandomInt32Max(seed uint64, max int32, out []int32) { 278 r := rand.New(rand.NewSource(seed)) 279 for idx := range out { 280 out[idx] = r.Int31n(max) 281 } 282 } 283 284 // FillRandomUint32Max populates out with random uint32 values between 0 and max using seed as the random 285 // seed for the generator to allow consistency for testing. 286 func FillRandomUint32Max(seed uint64, max uint32, out []uint32) { 287 r := rand.New(rand.NewSource(seed)) 288 for idx := range out { 289 out[idx] = uint32(r.Uint64n(uint64(max))) 290 } 291 } 292 293 // FillRandomInt64Max populates out with random int64 values between 0 and max using seed as the random 294 // seed for the generator to allow consistency for testing. 295 func FillRandomInt64Max(seed uint64, max int64, out []int64) { 296 r := rand.New(rand.NewSource(seed)) 297 for idx := range out { 298 out[idx] = r.Int63n(max) 299 } 300 } 301 302 // FillRandomUint32 populates out with random uint32 values using seed as the random 303 // seed for the generator to allow consistency for testing. 304 func FillRandomUint32(seed uint64, out []uint32) { 305 r := rand.New(rand.NewSource(seed)) 306 for idx := range out { 307 out[idx] = r.Uint32() 308 } 309 } 310 311 // FillRandomUint64 populates out with random uint64 values using seed as the random 312 // seed for the generator to allow consistency for testing. 313 func FillRandomUint64(seed uint64, out []uint64) { 314 r := rand.New(rand.NewSource(seed)) 315 for idx := range out { 316 out[idx] = r.Uint64() 317 } 318 } 319 320 // FillRandomUint64Max populates out with random uint64 values between 0 and max using seed as the random 321 // seed for the generator to allow consistency for testing. 322 func FillRandomUint64Max(seed uint64, max uint64, out []uint64) { 323 r := rand.New(rand.NewSource(seed)) 324 for idx := range out { 325 out[idx] = r.Uint64n(max) 326 } 327 } 328 329 // FillRandomInt64 populates out with random int64 values using seed as the random 330 // seed for the generator to allow consistency for testing. 331 func FillRandomInt64(seed uint64, out []int64) { 332 r := rand.New(rand.NewSource(seed)) 333 for idx := range out { 334 out[idx] = int64(r.Uint64()) 335 } 336 } 337 338 // FillRandomInt96 populates out with random Int96 values using seed as the random 339 // seed for the generator to allow consistency for testing. It does this by generating 340 // three random uint32 values for each int96 value. 341 func FillRandomInt96(seed uint64, out []parquet.Int96) { 342 r := rand.New(rand.NewSource(seed)) 343 for idx := range out { 344 *(*int32)(unsafe.Pointer(&out[idx][0])) = int32(r.Uint32()) 345 *(*int32)(unsafe.Pointer(&out[idx][4])) = int32(r.Uint32()) 346 *(*int32)(unsafe.Pointer(&out[idx][8])) = int32(r.Uint32()) 347 } 348 } 349 350 // randFloat32 creates a random float value with a normal distribution 351 // to better spread the values out and ensure we do not return any NaN values. 352 func randFloat32(r *rand.Rand) float32 { 353 for { 354 f := math.Float32frombits(r.Uint32()) 355 if !math.IsNaN(float64(f)) { 356 return f 357 } 358 } 359 } 360 361 // randFloat64 creates a random float value with a normal distribution 362 // to better spread the values out and ensure we do not return any NaN values. 363 func randFloat64(r *rand.Rand) float64 { 364 for { 365 f := math.Float64frombits(r.Uint64()) 366 if !math.IsNaN(f) { 367 return f 368 } 369 } 370 } 371 372 // FillRandomFloat32 populates out with random float32 values using seed as the random 373 // seed for the generator to allow consistency for testing. 374 func FillRandomFloat32(seed uint64, out []float32) { 375 r := rand.New(rand.NewSource(seed)) 376 for idx := range out { 377 out[idx] = randFloat32(r) 378 } 379 } 380 381 // FillRandomFloat64 populates out with random float64 values using seed as the random 382 // seed for the generator to allow consistency for testing. 383 func FillRandomFloat64(seed uint64, out []float64) { 384 r := rand.New(rand.NewSource(seed)) 385 for idx := range out { 386 out[idx] = randFloat64(r) 387 } 388 } 389 390 // FillRandomByteArray populates out with random ByteArray values with lengths between 2 and 12 391 // using heap as the actual memory storage used for the bytes generated. Each element of 392 // out will be some slice of the bytes in heap, and as such heap must outlive the byte array slices. 393 func FillRandomByteArray(seed uint64, out []parquet.ByteArray, heap *memory.Buffer) { 394 const ( 395 maxByteArrayLen = 12 396 minByteArrayLen = 2 397 ) 398 RandomByteArray(seed, out, heap, minByteArrayLen, maxByteArrayLen) 399 } 400 401 // FillRandomFixedByteArray populates out with random FixedLenByteArray values with of a length equal to size 402 // using heap as the actual memory storage used for the bytes generated. Each element of 403 // out will be a slice of size bytes in heap, and as such heap must outlive the byte array slices. 404 func FillRandomFixedByteArray(seed uint64, out []parquet.FixedLenByteArray, heap *memory.Buffer, size int) { 405 heap.Resize(len(out) * size) 406 407 buf := heap.Bytes() 408 r := rand.New(rand.NewSource(seed)) 409 for idx := range out { 410 r.Read(buf[:size]) 411 out[idx] = buf[:size] 412 buf = buf[size:] 413 } 414 } 415 416 // FillRandomBooleans populates out with random bools with the probability p of being false using 417 // seed as the random seed to the generator in order to allow consistency for testing. This uses 418 // a Bernoulli distribution of values. 419 func FillRandomBooleans(p float64, seed uint64, out []bool) { 420 dist := distuv.Bernoulli{P: p, Src: rand.NewSource(seed)} 421 for idx := range out { 422 out[idx] = dist.Rand() != float64(0.0) 423 } 424 } 425 426 // fillRandomIsValid populates out with random bools with the probability pctNull of being false using 427 // seed as the random seed to the generator in order to allow consistency for testing. This uses 428 // the default Golang random generator distribution of float64 values between 0 and 1 comparing against 429 // pctNull. If the random value is > pctNull, it is true. 430 func fillRandomIsValid(seed uint64, pctNull float64, out []bool) { 431 r := rand.New(rand.NewSource(seed)) 432 for idx := range out { 433 out[idx] = r.Float64() > pctNull 434 } 435 } 436 437 // InitValues is a convenience function for generating a slice of random values based on the type. 438 // If the type is parquet.ByteArray or parquet.FixedLenByteArray, heap must not be null. 439 // 440 // The default values are: 441 // []bool uses the current time as the seed with only values of 1 being false, for use 442 // of creating validity boolean slices. 443 // all other types use 0 as the seed 444 // a []parquet.ByteArray is populated with lengths between 2 and 12 445 // a []parquet.FixedLenByteArray is populated with fixed size random byte arrays of length 12. 446 func InitValues(values interface{}, heap *memory.Buffer) { 447 switch arr := values.(type) { 448 case []bool: 449 fillRandomIsValid(uint64(time.Now().Unix()), 1.0, arr) 450 case []int32: 451 FillRandomInt32(0, arr) 452 case []int64: 453 FillRandomInt64(0, arr) 454 case []float32: 455 FillRandomFloat32(0, arr) 456 case []float64: 457 FillRandomFloat64(0, arr) 458 case []parquet.Int96: 459 FillRandomInt96(0, arr) 460 case []parquet.ByteArray: 461 FillRandomByteArray(0, arr, heap) 462 case []parquet.FixedLenByteArray: 463 FillRandomFixedByteArray(0, arr, heap, 12) 464 } 465 } 466 467 // RandomByteArray populates out with random ByteArray values with lengths between minlen and maxlen 468 // using heap as the actual memory storage used for the bytes generated. Each element of 469 // out will be some slice of the bytes in heap, and as such heap must outlive the byte array slices. 470 func RandomByteArray(seed uint64, out []parquet.ByteArray, heap *memory.Buffer, minlen, maxlen int) { 471 heap.Resize(len(out) * (maxlen + arrow.Uint32SizeBytes)) 472 473 buf := heap.Bytes() 474 r := rand.New(rand.NewSource(seed)) 475 for idx := range out { 476 length := r.Intn(maxlen-minlen+1) + minlen 477 r.Read(buf[:length]) 478 out[idx] = buf[:length] 479 480 buf = buf[length:] 481 } 482 } 483 484 // RandomDecimals generates n random decimal values with precision determining the byte width 485 // for the values and seed as the random generator seed to allow consistency for testing. The 486 // resulting values will be either 32 bytes or 16 bytes each depending on the precision. 487 func RandomDecimals(n int64, seed uint64, precision int32) []byte { 488 r := rand.New(rand.NewSource(seed)) 489 nreqBytes := pqarrow.DecimalSize(precision) 490 byteWidth := 32 491 if precision <= 38 { 492 byteWidth = 16 493 } 494 495 out := make([]byte, int(int64(byteWidth)*n)) 496 for i := int64(0); i < n; i++ { 497 start := int(i) * byteWidth 498 r.Read(out[start : start+int(nreqBytes)]) 499 // sign extend if the sign bit is set for the last generated byte 500 // 0b10000000 == 0x80 == 128 501 if out[start+int(nreqBytes)-1]&byte(0x80) != 0 { 502 memory.Set(out[start+int(nreqBytes):start+byteWidth], 0xFF) 503 } 504 505 // byte swap for big endian 506 if endian.IsBigEndian { 507 for j := 0; j+8 <= byteWidth; j += 8 { 508 v := binary.LittleEndian.Uint64(out[start+j : start+j+8]) 509 binary.BigEndian.PutUint64(out[start+j:start+j+8], v) 510 } 511 } 512 } 513 return out 514 }