github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/testutils/random.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  // Package testutils contains utilities for generating random data and other
    18  // helpers that are used for testing the various aspects of the parquet library.
    19  package testutils
    20  
    21  import (
    22  	"encoding/binary"
    23  	"math"
    24  	"time"
    25  	"unsafe"
    26  
    27  	"github.com/apache/arrow/go/v7/arrow"
    28  	"github.com/apache/arrow/go/v7/arrow/array"
    29  	"github.com/apache/arrow/go/v7/arrow/bitutil"
    30  	"github.com/apache/arrow/go/v7/arrow/endian"
    31  	"github.com/apache/arrow/go/v7/arrow/memory"
    32  	"github.com/apache/arrow/go/v7/parquet"
    33  	"github.com/apache/arrow/go/v7/parquet/pqarrow"
    34  
    35  	"golang.org/x/exp/rand"
    36  	"gonum.org/v1/gonum/stat/distuv"
    37  )
    38  
    39  // RandomArrayGenerator is a struct used for constructing Random Arrow arrays
    40  // for use with testing.
    41  type RandomArrayGenerator struct {
    42  	seed     uint64
    43  	extra    uint64
    44  	src      rand.Source
    45  	seedRand *rand.Rand
    46  }
    47  
    48  // NewRandomArrayGenerator constructs a new generator with the requested Seed
    49  func NewRandomArrayGenerator(seed uint64) RandomArrayGenerator {
    50  	src := rand.NewSource(seed)
    51  	return RandomArrayGenerator{seed, 0, src, rand.New(src)}
    52  }
    53  
    54  // GenerateBitmap generates a bitmap of n bits and stores it into buffer. Prob is the probability
    55  // that a given bit will be zero, with 1-prob being the probability it will be 1. The return value
    56  // is the number of bits that were left unset. The assumption being that buffer is currently
    57  // zero initialized as this function does not clear any bits, it only sets 1s.
    58  func (r *RandomArrayGenerator) GenerateBitmap(buffer []byte, n int64, prob float64) int64 {
    59  	count := int64(0)
    60  	r.extra++
    61  
    62  	// bernoulli distribution uses P to determine the probabitiliy of a 0 or a 1,
    63  	// which we'll use to generate the bitmap.
    64  	dist := distuv.Bernoulli{P: prob, Src: rand.NewSource(r.seed + r.extra)}
    65  	for i := 0; int64(i) < n; i++ {
    66  		if dist.Rand() != float64(0.0) {
    67  			bitutil.SetBit(buffer, i)
    68  		} else {
    69  			count++
    70  		}
    71  	}
    72  
    73  	return count
    74  }
    75  
    76  // ByteArray creates an array.String for use of creating random ByteArray values for testing parquet
    77  // writing/reading. minLen/maxLen are the min and max length for a given value in the resulting array,
    78  // with nullProb being the probability of a given index being null.
    79  //
    80  // For this generation we only generate ascii values with a min of 'A' and max of 'z'.
    81  func (r *RandomArrayGenerator) ByteArray(size int64, minLen, maxLen int32, nullProb float64) arrow.Array {
    82  	if nullProb < 0 || nullProb > 1 {
    83  		panic("null prob must be between 0 and 1")
    84  	}
    85  
    86  	lengths := r.Int32(size, minLen, maxLen, nullProb)
    87  	defer lengths.Release()
    88  
    89  	r.extra++
    90  	dist := rand.New(rand.NewSource(r.seed + r.extra))
    91  	bldr := array.NewStringBuilder(memory.DefaultAllocator)
    92  	defer bldr.Release()
    93  
    94  	strbuf := make([]byte, maxLen)
    95  
    96  	for i := 0; int64(i) < size; i++ {
    97  		if lengths.IsValid(i) {
    98  			l := lengths.Value(i)
    99  			for j := int32(0); j < l; j++ {
   100  				strbuf[j] = byte(dist.Int31n(int32('z')-int32('A')+1) + int32('A'))
   101  			}
   102  			val := strbuf[:l]
   103  			bldr.Append(*(*string)(unsafe.Pointer(&val)))
   104  		} else {
   105  			bldr.AppendNull()
   106  		}
   107  	}
   108  
   109  	return bldr.NewArray()
   110  }
   111  
   112  // Uint8 generates a random array.Uint8 of the requested size whose values are between min and max
   113  // with prob as the probability that a given index will be null.
   114  func (r *RandomArrayGenerator) Uint8(size int64, min, max uint8, prob float64) arrow.Array {
   115  	buffers := make([]*memory.Buffer, 2)
   116  	nullCount := int64(0)
   117  
   118  	buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator)
   119  	buffers[0].Resize(int(bitutil.BytesForBits(size)))
   120  	nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, prob)
   121  
   122  	buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator)
   123  	buffers[1].Resize(int(size * int64(arrow.Uint8SizeBytes)))
   124  
   125  	r.extra++
   126  	dist := rand.New(rand.NewSource(r.seed + r.extra))
   127  	out := arrow.Uint8Traits.CastFromBytes(buffers[1].Bytes())
   128  	for i := int64(0); i < size; i++ {
   129  		out[i] = uint8(dist.Intn(int(max-min+1))) + min
   130  	}
   131  
   132  	return array.NewUint8Data(array.NewData(arrow.PrimitiveTypes.Uint8, int(size), buffers, nil, int(nullCount), 0))
   133  }
   134  
   135  // Int32 generates a random array.Int32 of the given size with each value between min and max,
   136  // and pctNull as the probability that a given index will be null.
   137  func (r *RandomArrayGenerator) Int32(size int64, min, max int32, pctNull float64) *array.Int32 {
   138  	buffers := make([]*memory.Buffer, 2)
   139  	nullCount := int64(0)
   140  
   141  	buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator)
   142  	buffers[0].Resize(int(bitutil.BytesForBits(size)))
   143  	nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, 1-pctNull)
   144  
   145  	buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator)
   146  	buffers[1].Resize(arrow.Int32Traits.BytesRequired(int(size)))
   147  
   148  	r.extra++
   149  	dist := rand.New(rand.NewSource(r.seed + r.extra))
   150  	out := arrow.Int32Traits.CastFromBytes(buffers[1].Bytes())
   151  	for i := int64(0); i < size; i++ {
   152  		out[i] = dist.Int31n(max-min+1) + min
   153  	}
   154  	return array.NewInt32Data(array.NewData(arrow.PrimitiveTypes.Int32, int(size), buffers, nil, int(nullCount), 0))
   155  }
   156  
   157  // Float64 generates a random array.Float64 of the requested size with pctNull as the probability
   158  // that a given index will be null.
   159  func (r *RandomArrayGenerator) Float64(size int64, pctNull float64) *array.Float64 {
   160  	buffers := make([]*memory.Buffer, 2)
   161  	nullCount := int64(0)
   162  
   163  	buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator)
   164  	buffers[0].Resize(int(bitutil.BytesForBits(size)))
   165  	nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, 1-pctNull)
   166  
   167  	buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator)
   168  	buffers[1].Resize(arrow.Float64Traits.BytesRequired(int(size)))
   169  
   170  	r.extra++
   171  	dist := rand.New(rand.NewSource(r.seed + r.extra))
   172  	out := arrow.Float64Traits.CastFromBytes(buffers[1].Bytes())
   173  	for i := int64(0); i < size; i++ {
   174  		out[i] = dist.NormFloat64()
   175  	}
   176  	return array.NewFloat64Data(array.NewData(arrow.PrimitiveTypes.Float64, int(size), buffers, nil, int(nullCount), 0))
   177  }
   178  
   179  // FillRandomInt8 populates the slice out with random int8 values between min and max using
   180  // seed as the random see for generation to allow consistency for testing.
   181  func FillRandomInt8(seed uint64, min, max int8, out []int8) {
   182  	r := rand.New(rand.NewSource(seed))
   183  	for idx := range out {
   184  		out[idx] = int8(r.Intn(int(max-min+1))) + min
   185  	}
   186  }
   187  
   188  // FillRandomUint8 populates the slice out with random uint8 values between min and max using
   189  // seed as the random see for generation to allow consistency for testing.
   190  func FillRandomUint8(seed uint64, min, max uint8, out []uint8) {
   191  	r := rand.New(rand.NewSource(seed))
   192  	for idx := range out {
   193  		out[idx] = uint8(r.Intn(int(max-min+1))) + min
   194  	}
   195  }
   196  
   197  // FillRandomInt16 populates the slice out with random int16 values between min and max using
   198  // seed as the random see for generation to allow consistency for testing.
   199  func FillRandomInt16(seed uint64, min, max int16, out []int16) {
   200  	r := rand.New(rand.NewSource(seed))
   201  	for idx := range out {
   202  		out[idx] = int16(r.Intn(int(max-min+1))) + min
   203  	}
   204  }
   205  
   206  // FillRandomUint16 populates the slice out with random uint16 values between min and max using
   207  // seed as the random see for generation to allow consistency for testing.
   208  func FillRandomUint16(seed uint64, min, max uint16, out []uint16) {
   209  	r := rand.New(rand.NewSource(seed))
   210  	for idx := range out {
   211  		out[idx] = uint16(r.Intn(int(max-min+1))) + min
   212  	}
   213  }
   214  
   215  // FillRandomInt32 populates out with random int32 values using seed as the random
   216  // seed for the generator to allow consistency for testing.
   217  func FillRandomInt32(seed uint64, out []int32) {
   218  	r := rand.New(rand.NewSource(seed))
   219  	for idx := range out {
   220  		out[idx] = int32(r.Uint32())
   221  	}
   222  }
   223  
   224  // FillRandomInt32Max populates out with random int32 values between 0 and max using seed as the random
   225  // seed for the generator to allow consistency for testing.
   226  func FillRandomInt32Max(seed uint64, max int32, out []int32) {
   227  	r := rand.New(rand.NewSource(seed))
   228  	for idx := range out {
   229  		out[idx] = r.Int31n(max)
   230  	}
   231  }
   232  
   233  // FillRandomUint32Max populates out with random uint32 values between 0 and max using seed as the random
   234  // seed for the generator to allow consistency for testing.
   235  func FillRandomUint32Max(seed uint64, max uint32, out []uint32) {
   236  	r := rand.New(rand.NewSource(seed))
   237  	for idx := range out {
   238  		out[idx] = uint32(r.Uint64n(uint64(max)))
   239  	}
   240  }
   241  
   242  // FillRandomInt64Max populates out with random int64 values between 0 and max using seed as the random
   243  // seed for the generator to allow consistency for testing.
   244  func FillRandomInt64Max(seed uint64, max int64, out []int64) {
   245  	r := rand.New(rand.NewSource(seed))
   246  	for idx := range out {
   247  		out[idx] = r.Int63n(max)
   248  	}
   249  }
   250  
   251  // FillRandomUint32 populates out with random uint32 values using seed as the random
   252  // seed for the generator to allow consistency for testing.
   253  func FillRandomUint32(seed uint64, out []uint32) {
   254  	r := rand.New(rand.NewSource(seed))
   255  	for idx := range out {
   256  		out[idx] = r.Uint32()
   257  	}
   258  }
   259  
   260  // FillRandomUint64 populates out with random uint64 values using seed as the random
   261  // seed for the generator to allow consistency for testing.
   262  func FillRandomUint64(seed uint64, out []uint64) {
   263  	r := rand.New(rand.NewSource(seed))
   264  	for idx := range out {
   265  		out[idx] = r.Uint64()
   266  	}
   267  }
   268  
   269  // FillRandomUint64Max populates out with random uint64 values between 0 and max using seed as the random
   270  // seed for the generator to allow consistency for testing.
   271  func FillRandomUint64Max(seed uint64, max uint64, out []uint64) {
   272  	r := rand.New(rand.NewSource(seed))
   273  	for idx := range out {
   274  		out[idx] = r.Uint64n(max)
   275  	}
   276  }
   277  
   278  // FillRandomInt64 populates out with random int64 values using seed as the random
   279  // seed for the generator to allow consistency for testing.
   280  func FillRandomInt64(seed uint64, out []int64) {
   281  	r := rand.New(rand.NewSource(seed))
   282  	for idx := range out {
   283  		out[idx] = int64(r.Uint64())
   284  	}
   285  }
   286  
   287  // FillRandomInt96 populates out with random Int96 values using seed as the random
   288  // seed for the generator to allow consistency for testing. It does this by generating
   289  // three random uint32 values for each int96 value.
   290  func FillRandomInt96(seed uint64, out []parquet.Int96) {
   291  	r := rand.New(rand.NewSource(seed))
   292  	for idx := range out {
   293  		*(*int32)(unsafe.Pointer(&out[idx][0])) = int32(r.Uint32())
   294  		*(*int32)(unsafe.Pointer(&out[idx][4])) = int32(r.Uint32())
   295  		*(*int32)(unsafe.Pointer(&out[idx][8])) = int32(r.Uint32())
   296  	}
   297  }
   298  
   299  // randFloat32 creates a random float value with a normal distribution
   300  // to better spread the values out and ensure we do not return any NaN values.
   301  func randFloat32(r *rand.Rand) float32 {
   302  	for {
   303  		f := math.Float32frombits(r.Uint32())
   304  		if !math.IsNaN(float64(f)) {
   305  			return f
   306  		}
   307  	}
   308  }
   309  
   310  // randFloat64 creates a random float value with a normal distribution
   311  // to better spread the values out and ensure we do not return any NaN values.
   312  func randFloat64(r *rand.Rand) float64 {
   313  	for {
   314  		f := math.Float64frombits(r.Uint64())
   315  		if !math.IsNaN(f) {
   316  			return f
   317  		}
   318  	}
   319  }
   320  
   321  // FillRandomFloat32 populates out with random float32 values using seed as the random
   322  // seed for the generator to allow consistency for testing.
   323  func FillRandomFloat32(seed uint64, out []float32) {
   324  	r := rand.New(rand.NewSource(seed))
   325  	for idx := range out {
   326  		out[idx] = randFloat32(r)
   327  	}
   328  }
   329  
   330  // FillRandomFloat64 populates out with random float64 values using seed as the random
   331  // seed for the generator to allow consistency for testing.
   332  func FillRandomFloat64(seed uint64, out []float64) {
   333  	r := rand.New(rand.NewSource(seed))
   334  	for idx := range out {
   335  		out[idx] = randFloat64(r)
   336  	}
   337  }
   338  
   339  // FillRandomByteArray populates out with random ByteArray values with lengths between 2 and 12
   340  // using heap as the actual memory storage used for the bytes generated. Each element of
   341  // out will be some slice of the bytes in heap, and as such heap must outlive the byte array slices.
   342  func FillRandomByteArray(seed uint64, out []parquet.ByteArray, heap *memory.Buffer) {
   343  	const (
   344  		maxByteArrayLen = 12
   345  		minByteArrayLen = 2
   346  	)
   347  	RandomByteArray(seed, out, heap, minByteArrayLen, maxByteArrayLen)
   348  }
   349  
   350  // FillRandomFixedByteArray populates out with random FixedLenByteArray values with of a length equal to size
   351  // using heap as the actual memory storage used for the bytes generated. Each element of
   352  // out will be a slice of size bytes in heap, and as such heap must outlive the byte array slices.
   353  func FillRandomFixedByteArray(seed uint64, out []parquet.FixedLenByteArray, heap *memory.Buffer, size int) {
   354  	heap.Resize(len(out) * size)
   355  
   356  	buf := heap.Bytes()
   357  	r := rand.New(rand.NewSource(seed))
   358  	for idx := range out {
   359  		r.Read(buf[:size])
   360  		out[idx] = buf[:size]
   361  		buf = buf[size:]
   362  	}
   363  }
   364  
   365  // FillRandomBooleans populates out with random bools with the probability p of being false using
   366  // seed as the random seed to the generator in order to allow consistency for testing. This uses
   367  // a Bernoulli distribution of values.
   368  func FillRandomBooleans(p float64, seed uint64, out []bool) {
   369  	dist := distuv.Bernoulli{P: p, Src: rand.NewSource(seed)}
   370  	for idx := range out {
   371  		out[idx] = dist.Rand() != float64(0.0)
   372  	}
   373  }
   374  
   375  // fillRandomIsValid populates out with random bools with the probability pctNull of being false using
   376  // seed as the random seed to the generator in order to allow consistency for testing. This uses
   377  // the default Golang random generator distribution of float64 values between 0 and 1 comparing against
   378  // pctNull. If the random value is > pctNull, it is true.
   379  func fillRandomIsValid(seed uint64, pctNull float64, out []bool) {
   380  	r := rand.New(rand.NewSource(seed))
   381  	for idx := range out {
   382  		out[idx] = r.Float64() > pctNull
   383  	}
   384  }
   385  
   386  // InitValues is a convenience function for generating a slice of random values based on the type.
   387  // If the type is parquet.ByteArray or parquet.FixedLenByteArray, heap must not be null.
   388  //
   389  // The default values are:
   390  //  []bool uses the current time as the seed with only values of 1 being false, for use
   391  //   of creating validity boolean slices.
   392  //  all other types use 0 as the seed
   393  //  a []parquet.ByteArray is populated with lengths between 2 and 12
   394  //  a []parquet.FixedLenByteArray is populated with fixed size random byte arrays of length 12.
   395  func InitValues(values interface{}, heap *memory.Buffer) {
   396  	switch arr := values.(type) {
   397  	case []bool:
   398  		fillRandomIsValid(uint64(time.Now().Unix()), 1.0, arr)
   399  	case []int32:
   400  		FillRandomInt32(0, arr)
   401  	case []int64:
   402  		FillRandomInt64(0, arr)
   403  	case []float32:
   404  		FillRandomFloat32(0, arr)
   405  	case []float64:
   406  		FillRandomFloat64(0, arr)
   407  	case []parquet.Int96:
   408  		FillRandomInt96(0, arr)
   409  	case []parquet.ByteArray:
   410  		FillRandomByteArray(0, arr, heap)
   411  	case []parquet.FixedLenByteArray:
   412  		FillRandomFixedByteArray(0, arr, heap, 12)
   413  	}
   414  }
   415  
   416  // RandomByteArray populates out with random ByteArray values with lengths between minlen and maxlen
   417  // using heap as the actual memory storage used for the bytes generated. Each element of
   418  // out will be some slice of the bytes in heap, and as such heap must outlive the byte array slices.
   419  func RandomByteArray(seed uint64, out []parquet.ByteArray, heap *memory.Buffer, minlen, maxlen int) {
   420  	heap.Resize(len(out) * (maxlen + arrow.Uint32SizeBytes))
   421  
   422  	buf := heap.Bytes()
   423  	r := rand.New(rand.NewSource(seed))
   424  	for idx := range out {
   425  		length := r.Intn(maxlen-minlen+1) + minlen
   426  		r.Read(buf[:length])
   427  		out[idx] = buf[:length]
   428  
   429  		buf = buf[length:]
   430  	}
   431  }
   432  
   433  // RandomDecimals generates n random decimal values with precision determining the byte width
   434  // for the values and seed as the random generator seed to allow consistency for testing. The
   435  // resulting values will be either 32 bytes or 16 bytes each depending on the precision.
   436  func RandomDecimals(n int64, seed uint64, precision int32) []byte {
   437  	r := rand.New(rand.NewSource(seed))
   438  	nreqBytes := pqarrow.DecimalSize(precision)
   439  	byteWidth := 32
   440  	if precision <= 38 {
   441  		byteWidth = 16
   442  	}
   443  
   444  	out := make([]byte, int(int64(byteWidth)*n))
   445  	for i := int64(0); i < n; i++ {
   446  		start := int(i) * byteWidth
   447  		r.Read(out[start : start+int(nreqBytes)])
   448  		// sign extend if the sign bit is set for the last generated byte
   449  		// 0b10000000 == 0x80 == 128
   450  		if out[start+int(nreqBytes)-1]&byte(0x80) != 0 {
   451  			memory.Set(out[start+int(nreqBytes):start+byteWidth], 0xFF)
   452  		}
   453  
   454  		// byte swap for big endian
   455  		if endian.IsBigEndian {
   456  			for j := 0; j+8 <= byteWidth; j += 8 {
   457  				v := binary.LittleEndian.Uint64(out[start+j : start+j+8])
   458  				binary.BigEndian.PutUint64(out[start+j:start+j+8], v)
   459  			}
   460  		}
   461  	}
   462  	return out
   463  }