github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/hashing/hashing_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package hashing
    18  
    19  import (
    20  	"math/rand"
    21  	"testing"
    22  
    23  	"github.com/stretchr/testify/assert"
    24  )
    25  
    26  func MakeDistinctIntegers(nvals int) map[int]bool {
    27  	r := rand.New(rand.NewSource(42))
    28  	values := make(map[int]bool)
    29  	for len(values) < nvals {
    30  		values[r.Int()] = true
    31  	}
    32  	return values
    33  }
    34  
    35  func MakeSequentialIntegers(nvals int) map[int]bool {
    36  	values := make(map[int]bool)
    37  	for i := 0; i < nvals; i++ {
    38  		values[i] = true
    39  	}
    40  	return values
    41  }
    42  
    43  func MakeDistinctStrings(nvals int) map[string]bool {
    44  	values := make(map[string]bool)
    45  
    46  	r := rand.New(rand.NewSource(42))
    47  
    48  	max := 'z'
    49  	min := '0'
    50  	for len(values) < nvals {
    51  		data := make([]byte, r.Intn(24))
    52  		for idx := range data {
    53  			data[idx] = byte(r.Intn(int(max-min+1)) + int(min))
    54  		}
    55  		values[string(data)] = true
    56  	}
    57  	return values
    58  }
    59  
    60  func TestHashingQualityInt(t *testing.T) {
    61  	const nvalues = 10000
    62  
    63  	tests := []struct {
    64  		name    string
    65  		values  map[int]bool
    66  		quality float64
    67  	}{
    68  		{"distinct", MakeDistinctIntegers(nvalues), 0.96},
    69  		{"sequential", MakeSequentialIntegers(nvalues), 0.96},
    70  	}
    71  
    72  	for _, tt := range tests {
    73  		t.Run(tt.name, func(t *testing.T) {
    74  			hashes := make(map[uint64]bool)
    75  			for k := range tt.values {
    76  				hashes[hashInt(uint64(k), 0)] = true
    77  				hashes[hashInt(uint64(k), 1)] = true
    78  			}
    79  			assert.GreaterOrEqual(t, float64(len(hashes)), tt.quality*float64(2*len(tt.values)))
    80  		})
    81  	}
    82  }
    83  
    84  func TestHashingBoundsStrings(t *testing.T) {
    85  	sizes := []int{1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17, 18, 19, 20, 21}
    86  	for _, s := range sizes {
    87  		str := make([]byte, s)
    88  		for idx := range str {
    89  			str[idx] = uint8(idx)
    90  		}
    91  
    92  		h := hash(str, 1)
    93  		diff := 0
    94  		for i := 0; i < 120; i++ {
    95  			str[len(str)-1] = uint8(i)
    96  			if hash(str, 1) != h {
    97  				diff++
    98  			}
    99  		}
   100  		assert.GreaterOrEqual(t, diff, 118)
   101  	}
   102  }
   103  
   104  func TestHashingQualityString(t *testing.T) {
   105  	const nvalues = 10000
   106  	values := MakeDistinctStrings(nvalues)
   107  
   108  	hashes := make(map[uint64]bool)
   109  	for k := range values {
   110  		hashes[hashString(k, 0)] = true
   111  		hashes[hashString(k, 1)] = true
   112  	}
   113  	assert.GreaterOrEqual(t, float64(len(hashes)), 0.96*float64(2*len(values)))
   114  }