github.com/apache/arrow/go/v14@v14.0.1/parquet/internal/testutils/random.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  // Package testutils contains utilities for generating random data and other
    18  // helpers that are used for testing the various aspects of the parquet library.
    19  package testutils
    20  
    21  import (
    22  	"encoding/binary"
    23  	"math"
    24  	"time"
    25  	"unsafe"
    26  
    27  	"github.com/apache/arrow/go/v14/arrow"
    28  	"github.com/apache/arrow/go/v14/arrow/array"
    29  	"github.com/apache/arrow/go/v14/arrow/bitutil"
    30  	"github.com/apache/arrow/go/v14/arrow/endian"
    31  	"github.com/apache/arrow/go/v14/arrow/memory"
    32  	"github.com/apache/arrow/go/v14/parquet"
    33  	"github.com/apache/arrow/go/v14/parquet/pqarrow"
    34  
    35  	"golang.org/x/exp/rand"
    36  	"gonum.org/v1/gonum/stat/distuv"
    37  )
    38  
// RandomArrayGenerator is a struct used for constructing Random Arrow arrays
// for use with testing.
//
// Each generating method bumps the extra counter and seeds a fresh source with
// seed+extra, so a generator built from the same seed and driven through the
// same sequence of calls produces the same data.
type RandomArrayGenerator struct {
	// seed is the base seed handed to NewRandomArrayGenerator.
	seed uint64
	// extra is incremented before every new source is created and added to
	// seed, so successive calls do not reuse the same random stream.
	extra uint64
	// src is the source created from seed by the constructor; the methods in
	// this file do not read it directly.
	src rand.Source
	// seedRand wraps src; the methods in this file do not read it directly.
	seedRand *rand.Rand
}
    47  
    48  // NewRandomArrayGenerator constructs a new generator with the requested Seed
    49  func NewRandomArrayGenerator(seed uint64) RandomArrayGenerator {
    50  	src := rand.NewSource(seed)
    51  	return RandomArrayGenerator{seed, 0, src, rand.New(src)}
    52  }
    53  
    54  // GenerateBitmap generates a bitmap of n bits and stores it into buffer. Prob is the probability
    55  // that a given bit will be zero, with 1-prob being the probability it will be 1. The return value
    56  // is the number of bits that were left unset. The assumption being that buffer is currently
    57  // zero initialized as this function does not clear any bits, it only sets 1s.
    58  func (r *RandomArrayGenerator) GenerateBitmap(buffer []byte, n int64, prob float64) int64 {
    59  	count := int64(0)
    60  	r.extra++
    61  
    62  	// bernoulli distribution uses P to determine the probabitiliy of a 0 or a 1,
    63  	// which we'll use to generate the bitmap.
    64  	dist := distuv.Bernoulli{P: prob, Src: rand.NewSource(r.seed + r.extra)}
    65  	for i := 0; int64(i) < n; i++ {
    66  		if dist.Rand() != float64(0.0) {
    67  			bitutil.SetBit(buffer, i)
    68  		} else {
    69  			count++
    70  		}
    71  	}
    72  
    73  	return count
    74  }
    75  
    76  // ByteArray creates an array.String for use of creating random ByteArray values for testing parquet
    77  // writing/reading. minLen/maxLen are the min and max length for a given value in the resulting array,
    78  // with nullProb being the probability of a given index being null.
    79  //
    80  // For this generation we only generate ascii values with a min of 'A' and max of 'z'.
    81  func (r *RandomArrayGenerator) ByteArray(size int64, minLen, maxLen int32, nullProb float64) arrow.Array {
    82  	if nullProb < 0 || nullProb > 1 {
    83  		panic("null prob must be between 0 and 1")
    84  	}
    85  
    86  	lengths := r.Int32(size, minLen, maxLen, nullProb)
    87  	defer lengths.Release()
    88  
    89  	r.extra++
    90  	dist := rand.New(rand.NewSource(r.seed + r.extra))
    91  	bldr := array.NewStringBuilder(memory.DefaultAllocator)
    92  	defer bldr.Release()
    93  
    94  	strbuf := make([]byte, maxLen)
    95  
    96  	for i := 0; int64(i) < size; i++ {
    97  		if lengths.IsValid(i) {
    98  			l := lengths.Value(i)
    99  			for j := int32(0); j < l; j++ {
   100  				strbuf[j] = byte(dist.Int31n(int32('z')-int32('A')+1) + int32('A'))
   101  			}
   102  			val := strbuf[:l]
   103  			bldr.Append(*(*string)(unsafe.Pointer(&val)))
   104  		} else {
   105  			bldr.AppendNull()
   106  		}
   107  	}
   108  
   109  	return bldr.NewArray()
   110  }
   111  
   112  // Uint8 generates a random array.Uint8 of the requested size whose values are between min and max
   113  // with prob as the probability that a given index will be null.
   114  func (r *RandomArrayGenerator) Uint8(size int64, min, max uint8, prob float64) arrow.Array {
   115  	buffers := make([]*memory.Buffer, 2)
   116  	nullCount := int64(0)
   117  
   118  	buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator)
   119  	buffers[0].Resize(int(bitutil.BytesForBits(size)))
   120  	nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, prob)
   121  
   122  	buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator)
   123  	buffers[1].Resize(int(size * int64(arrow.Uint8SizeBytes)))
   124  
   125  	r.extra++
   126  	dist := rand.New(rand.NewSource(r.seed + r.extra))
   127  	out := arrow.Uint8Traits.CastFromBytes(buffers[1].Bytes())
   128  	for i := int64(0); i < size; i++ {
   129  		out[i] = uint8(dist.Intn(int(max-min+1))) + min
   130  	}
   131  
   132  	return array.NewUint8Data(array.NewData(arrow.PrimitiveTypes.Uint8, int(size), buffers, nil, int(nullCount), 0))
   133  }
   134  
   135  // Int32 generates a random array.Int32 of the given size with each value between min and max,
   136  // and pctNull as the probability that a given index will be null.
   137  func (r *RandomArrayGenerator) Int32(size int64, min, max int32, pctNull float64) *array.Int32 {
   138  	buffers := make([]*memory.Buffer, 2)
   139  	nullCount := int64(0)
   140  
   141  	buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator)
   142  	buffers[0].Resize(int(bitutil.BytesForBits(size)))
   143  	nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, 1-pctNull)
   144  
   145  	buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator)
   146  	buffers[1].Resize(arrow.Int32Traits.BytesRequired(int(size)))
   147  
   148  	r.extra++
   149  	dist := rand.New(rand.NewSource(r.seed + r.extra))
   150  	out := arrow.Int32Traits.CastFromBytes(buffers[1].Bytes())
   151  	for i := int64(0); i < size; i++ {
   152  		out[i] = dist.Int31n(max-min+1) + min
   153  	}
   154  	return array.NewInt32Data(array.NewData(arrow.PrimitiveTypes.Int32, int(size), buffers, nil, int(nullCount), 0))
   155  }
   156  
   157  // Int64 generates a random array.Int64 of the given size with each value between min and max,
   158  // and pctNull as the probability that a given index will be null.
   159  func (r *RandomArrayGenerator) Int64(size int64, min, max int64, pctNull float64) *array.Int64 {
   160  	buffers := make([]*memory.Buffer, 2)
   161  	nullCount := int64(0)
   162  
   163  	buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator)
   164  	buffers[0].Resize(int(bitutil.BytesForBits(size)))
   165  	nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, 1-pctNull)
   166  
   167  	buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator)
   168  	buffers[1].Resize(arrow.Int64Traits.BytesRequired(int(size)))
   169  
   170  	r.extra++
   171  	dist := rand.New(rand.NewSource(r.seed + r.extra))
   172  	out := arrow.Int64Traits.CastFromBytes(buffers[1].Bytes())
   173  	for i := int64(0); i < size; i++ {
   174  		out[i] = dist.Int63n(max-min+1) + min
   175  	}
   176  	return array.NewInt64Data(array.NewData(arrow.PrimitiveTypes.Int64, int(size), buffers, nil, int(nullCount), 0))
   177  }
   178  
   179  // Float64 generates a random array.Float64 of the requested size with pctNull as the probability
   180  // that a given index will be null.
   181  func (r *RandomArrayGenerator) Float64(size int64, pctNull float64) *array.Float64 {
   182  	buffers := make([]*memory.Buffer, 2)
   183  	nullCount := int64(0)
   184  
   185  	buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator)
   186  	buffers[0].Resize(int(bitutil.BytesForBits(size)))
   187  	nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, 1-pctNull)
   188  
   189  	buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator)
   190  	buffers[1].Resize(arrow.Float64Traits.BytesRequired(int(size)))
   191  
   192  	r.extra++
   193  	dist := rand.New(rand.NewSource(r.seed + r.extra))
   194  	out := arrow.Float64Traits.CastFromBytes(buffers[1].Bytes())
   195  	for i := int64(0); i < size; i++ {
   196  		out[i] = dist.NormFloat64()
   197  	}
   198  	return array.NewFloat64Data(array.NewData(arrow.PrimitiveTypes.Float64, int(size), buffers, nil, int(nullCount), 0))
   199  }
   200  
   201  func (r *RandomArrayGenerator) StringWithRepeats(mem memory.Allocator, sz, unique int64, minLen, maxLen int32, nullProb float64) *array.String {
   202  	if unique > sz {
   203  		panic("invalid config for random StringWithRepeats")
   204  	}
   205  
   206  	// generate a random string dictionary without any nulls
   207  	arr := r.ByteArray(unique, minLen, maxLen, 0)
   208  	defer arr.Release()
   209  	dict := arr.(*array.String)
   210  
   211  	// generate random indices to sample dictionary with
   212  	idArray := r.Int64(sz, 0, unique-1, nullProb)
   213  	defer idArray.Release()
   214  
   215  	bldr := array.NewStringBuilder(mem)
   216  	defer bldr.Release()
   217  
   218  	for i := int64(0); i < sz; i++ {
   219  		if idArray.IsValid(int(i)) {
   220  			idx := idArray.Value(int(i))
   221  			bldr.Append(dict.Value(int(idx)))
   222  		} else {
   223  			bldr.AppendNull()
   224  		}
   225  	}
   226  
   227  	return bldr.NewStringArray()
   228  }
   229  
   230  // FillRandomInt8 populates the slice out with random int8 values between min and max using
   231  // seed as the random see for generation to allow consistency for testing.
   232  func FillRandomInt8(seed uint64, min, max int8, out []int8) {
   233  	r := rand.New(rand.NewSource(seed))
   234  	for idx := range out {
   235  		out[idx] = int8(r.Intn(int(max-min+1))) + min
   236  	}
   237  }
   238  
   239  // FillRandomUint8 populates the slice out with random uint8 values between min and max using
   240  // seed as the random see for generation to allow consistency for testing.
   241  func FillRandomUint8(seed uint64, min, max uint8, out []uint8) {
   242  	r := rand.New(rand.NewSource(seed))
   243  	for idx := range out {
   244  		out[idx] = uint8(r.Intn(int(max-min+1))) + min
   245  	}
   246  }
   247  
   248  // FillRandomInt16 populates the slice out with random int16 values between min and max using
   249  // seed as the random see for generation to allow consistency for testing.
   250  func FillRandomInt16(seed uint64, min, max int16, out []int16) {
   251  	r := rand.New(rand.NewSource(seed))
   252  	for idx := range out {
   253  		out[idx] = int16(r.Intn(int(max-min+1))) + min
   254  	}
   255  }
   256  
   257  // FillRandomUint16 populates the slice out with random uint16 values between min and max using
   258  // seed as the random see for generation to allow consistency for testing.
   259  func FillRandomUint16(seed uint64, min, max uint16, out []uint16) {
   260  	r := rand.New(rand.NewSource(seed))
   261  	for idx := range out {
   262  		out[idx] = uint16(r.Intn(int(max-min+1))) + min
   263  	}
   264  }
   265  
   266  // FillRandomInt32 populates out with random int32 values using seed as the random
   267  // seed for the generator to allow consistency for testing.
   268  func FillRandomInt32(seed uint64, out []int32) {
   269  	r := rand.New(rand.NewSource(seed))
   270  	for idx := range out {
   271  		out[idx] = int32(r.Uint32())
   272  	}
   273  }
   274  
   275  // FillRandomInt32Max populates out with random int32 values between 0 and max using seed as the random
   276  // seed for the generator to allow consistency for testing.
   277  func FillRandomInt32Max(seed uint64, max int32, out []int32) {
   278  	r := rand.New(rand.NewSource(seed))
   279  	for idx := range out {
   280  		out[idx] = r.Int31n(max)
   281  	}
   282  }
   283  
   284  // FillRandomUint32Max populates out with random uint32 values between 0 and max using seed as the random
   285  // seed for the generator to allow consistency for testing.
   286  func FillRandomUint32Max(seed uint64, max uint32, out []uint32) {
   287  	r := rand.New(rand.NewSource(seed))
   288  	for idx := range out {
   289  		out[idx] = uint32(r.Uint64n(uint64(max)))
   290  	}
   291  }
   292  
   293  // FillRandomInt64Max populates out with random int64 values between 0 and max using seed as the random
   294  // seed for the generator to allow consistency for testing.
   295  func FillRandomInt64Max(seed uint64, max int64, out []int64) {
   296  	r := rand.New(rand.NewSource(seed))
   297  	for idx := range out {
   298  		out[idx] = r.Int63n(max)
   299  	}
   300  }
   301  
   302  // FillRandomUint32 populates out with random uint32 values using seed as the random
   303  // seed for the generator to allow consistency for testing.
   304  func FillRandomUint32(seed uint64, out []uint32) {
   305  	r := rand.New(rand.NewSource(seed))
   306  	for idx := range out {
   307  		out[idx] = r.Uint32()
   308  	}
   309  }
   310  
   311  // FillRandomUint64 populates out with random uint64 values using seed as the random
   312  // seed for the generator to allow consistency for testing.
   313  func FillRandomUint64(seed uint64, out []uint64) {
   314  	r := rand.New(rand.NewSource(seed))
   315  	for idx := range out {
   316  		out[idx] = r.Uint64()
   317  	}
   318  }
   319  
   320  // FillRandomUint64Max populates out with random uint64 values between 0 and max using seed as the random
   321  // seed for the generator to allow consistency for testing.
   322  func FillRandomUint64Max(seed uint64, max uint64, out []uint64) {
   323  	r := rand.New(rand.NewSource(seed))
   324  	for idx := range out {
   325  		out[idx] = r.Uint64n(max)
   326  	}
   327  }
   328  
   329  // FillRandomInt64 populates out with random int64 values using seed as the random
   330  // seed for the generator to allow consistency for testing.
   331  func FillRandomInt64(seed uint64, out []int64) {
   332  	r := rand.New(rand.NewSource(seed))
   333  	for idx := range out {
   334  		out[idx] = int64(r.Uint64())
   335  	}
   336  }
   337  
   338  // FillRandomInt96 populates out with random Int96 values using seed as the random
   339  // seed for the generator to allow consistency for testing. It does this by generating
   340  // three random uint32 values for each int96 value.
   341  func FillRandomInt96(seed uint64, out []parquet.Int96) {
   342  	r := rand.New(rand.NewSource(seed))
   343  	for idx := range out {
   344  		*(*int32)(unsafe.Pointer(&out[idx][0])) = int32(r.Uint32())
   345  		*(*int32)(unsafe.Pointer(&out[idx][4])) = int32(r.Uint32())
   346  		*(*int32)(unsafe.Pointer(&out[idx][8])) = int32(r.Uint32())
   347  	}
   348  }
   349  
   350  // randFloat32 creates a random float value with a normal distribution
   351  // to better spread the values out and ensure we do not return any NaN values.
   352  func randFloat32(r *rand.Rand) float32 {
   353  	for {
   354  		f := math.Float32frombits(r.Uint32())
   355  		if !math.IsNaN(float64(f)) {
   356  			return f
   357  		}
   358  	}
   359  }
   360  
   361  // randFloat64 creates a random float value with a normal distribution
   362  // to better spread the values out and ensure we do not return any NaN values.
   363  func randFloat64(r *rand.Rand) float64 {
   364  	for {
   365  		f := math.Float64frombits(r.Uint64())
   366  		if !math.IsNaN(f) {
   367  			return f
   368  		}
   369  	}
   370  }
   371  
   372  // FillRandomFloat32 populates out with random float32 values using seed as the random
   373  // seed for the generator to allow consistency for testing.
   374  func FillRandomFloat32(seed uint64, out []float32) {
   375  	r := rand.New(rand.NewSource(seed))
   376  	for idx := range out {
   377  		out[idx] = randFloat32(r)
   378  	}
   379  }
   380  
   381  // FillRandomFloat64 populates out with random float64 values using seed as the random
   382  // seed for the generator to allow consistency for testing.
   383  func FillRandomFloat64(seed uint64, out []float64) {
   384  	r := rand.New(rand.NewSource(seed))
   385  	for idx := range out {
   386  		out[idx] = randFloat64(r)
   387  	}
   388  }
   389  
   390  // FillRandomByteArray populates out with random ByteArray values with lengths between 2 and 12
   391  // using heap as the actual memory storage used for the bytes generated. Each element of
   392  // out will be some slice of the bytes in heap, and as such heap must outlive the byte array slices.
   393  func FillRandomByteArray(seed uint64, out []parquet.ByteArray, heap *memory.Buffer) {
   394  	const (
   395  		maxByteArrayLen = 12
   396  		minByteArrayLen = 2
   397  	)
   398  	RandomByteArray(seed, out, heap, minByteArrayLen, maxByteArrayLen)
   399  }
   400  
   401  // FillRandomFixedByteArray populates out with random FixedLenByteArray values with of a length equal to size
   402  // using heap as the actual memory storage used for the bytes generated. Each element of
   403  // out will be a slice of size bytes in heap, and as such heap must outlive the byte array slices.
   404  func FillRandomFixedByteArray(seed uint64, out []parquet.FixedLenByteArray, heap *memory.Buffer, size int) {
   405  	heap.Resize(len(out) * size)
   406  
   407  	buf := heap.Bytes()
   408  	r := rand.New(rand.NewSource(seed))
   409  	for idx := range out {
   410  		r.Read(buf[:size])
   411  		out[idx] = buf[:size]
   412  		buf = buf[size:]
   413  	}
   414  }
   415  
   416  // FillRandomBooleans populates out with random bools with the probability p of being false using
   417  // seed as the random seed to the generator in order to allow consistency for testing. This uses
   418  // a Bernoulli distribution of values.
   419  func FillRandomBooleans(p float64, seed uint64, out []bool) {
   420  	dist := distuv.Bernoulli{P: p, Src: rand.NewSource(seed)}
   421  	for idx := range out {
   422  		out[idx] = dist.Rand() != float64(0.0)
   423  	}
   424  }
   425  
   426  // fillRandomIsValid populates out with random bools with the probability pctNull of being false using
   427  // seed as the random seed to the generator in order to allow consistency for testing. This uses
   428  // the default Golang random generator distribution of float64 values between 0 and 1 comparing against
   429  // pctNull. If the random value is > pctNull, it is true.
   430  func fillRandomIsValid(seed uint64, pctNull float64, out []bool) {
   431  	r := rand.New(rand.NewSource(seed))
   432  	for idx := range out {
   433  		out[idx] = r.Float64() > pctNull
   434  	}
   435  }
   436  
   437  // InitValues is a convenience function for generating a slice of random values based on the type.
   438  // If the type is parquet.ByteArray or parquet.FixedLenByteArray, heap must not be null.
   439  //
   440  // The default values are:
   441  //  []bool uses the current time as the seed with only values of 1 being false, for use
   442  //   of creating validity boolean slices.
   443  //  all other types use 0 as the seed
   444  //  a []parquet.ByteArray is populated with lengths between 2 and 12
   445  //  a []parquet.FixedLenByteArray is populated with fixed size random byte arrays of length 12.
   446  func InitValues(values interface{}, heap *memory.Buffer) {
   447  	switch arr := values.(type) {
   448  	case []bool:
   449  		fillRandomIsValid(uint64(time.Now().Unix()), 1.0, arr)
   450  	case []int32:
   451  		FillRandomInt32(0, arr)
   452  	case []int64:
   453  		FillRandomInt64(0, arr)
   454  	case []float32:
   455  		FillRandomFloat32(0, arr)
   456  	case []float64:
   457  		FillRandomFloat64(0, arr)
   458  	case []parquet.Int96:
   459  		FillRandomInt96(0, arr)
   460  	case []parquet.ByteArray:
   461  		FillRandomByteArray(0, arr, heap)
   462  	case []parquet.FixedLenByteArray:
   463  		FillRandomFixedByteArray(0, arr, heap, 12)
   464  	}
   465  }
   466  
   467  // RandomByteArray populates out with random ByteArray values with lengths between minlen and maxlen
   468  // using heap as the actual memory storage used for the bytes generated. Each element of
   469  // out will be some slice of the bytes in heap, and as such heap must outlive the byte array slices.
   470  func RandomByteArray(seed uint64, out []parquet.ByteArray, heap *memory.Buffer, minlen, maxlen int) {
   471  	heap.Resize(len(out) * (maxlen + arrow.Uint32SizeBytes))
   472  
   473  	buf := heap.Bytes()
   474  	r := rand.New(rand.NewSource(seed))
   475  	for idx := range out {
   476  		length := r.Intn(maxlen-minlen+1) + minlen
   477  		r.Read(buf[:length])
   478  		out[idx] = buf[:length]
   479  
   480  		buf = buf[length:]
   481  	}
   482  }
   483  
   484  // RandomDecimals generates n random decimal values with precision determining the byte width
   485  // for the values and seed as the random generator seed to allow consistency for testing. The
   486  // resulting values will be either 32 bytes or 16 bytes each depending on the precision.
   487  func RandomDecimals(n int64, seed uint64, precision int32) []byte {
   488  	r := rand.New(rand.NewSource(seed))
   489  	nreqBytes := pqarrow.DecimalSize(precision)
   490  	byteWidth := 32
   491  	if precision <= 38 {
   492  		byteWidth = 16
   493  	}
   494  
   495  	out := make([]byte, int(int64(byteWidth)*n))
   496  	for i := int64(0); i < n; i++ {
   497  		start := int(i) * byteWidth
   498  		r.Read(out[start : start+int(nreqBytes)])
   499  		// sign extend if the sign bit is set for the last generated byte
   500  		// 0b10000000 == 0x80 == 128
   501  		if out[start+int(nreqBytes)-1]&byte(0x80) != 0 {
   502  			memory.Set(out[start+int(nreqBytes):start+byteWidth], 0xFF)
   503  		}
   504  
   505  		// byte swap for big endian
   506  		if endian.IsBigEndian {
   507  			for j := 0; j+8 <= byteWidth; j += 8 {
   508  				v := binary.LittleEndian.Uint64(out[start+j : start+j+8])
   509  				binary.BigEndian.PutUint64(out[start+j:start+j+8], v)
   510  			}
   511  		}
   512  	}
   513  	return out
   514  }