github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/compressionhelpers/tile_encoder.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package compressionhelpers
    13  
    14  import (
    15  	"encoding/binary"
    16  	"math"
    17  	"sync/atomic"
    18  
    19  	"gonum.org/v1/gonum/stat/distuv"
    20  )
    21  
    22  type distribution interface {
    23  	Transform(x float64) float64
    24  	CDF(x float64) float64
    25  	Quantile(x float64) float64
    26  }
    27  
    28  type logNormalDistribution struct {
    29  	dist *distuv.LogNormal
    30  }
    31  
    32  func newLogNormalDistribution(mean float64, std float64) distribution {
    33  	return &logNormalDistribution{
    34  		dist: &distuv.LogNormal{
    35  			Mu:    mean,
    36  			Sigma: std,
    37  		},
    38  	}
    39  }
    40  
    41  func (d *logNormalDistribution) Transform(x float64) float64 {
    42  	if x > 0 {
    43  		return math.Log(x)
    44  	}
    45  	return 0
    46  }
    47  
    48  func (d *logNormalDistribution) CDF(x float64) float64 {
    49  	return d.dist.CDF(x)
    50  }
    51  
    52  func (d *logNormalDistribution) Quantile(x float64) float64 {
    53  	return d.dist.Quantile(x)
    54  }
    55  
    56  type normalDistribution struct {
    57  	dist *distuv.Normal
    58  }
    59  
    60  func newNormalDistribution(mean float64, std float64) distribution {
    61  	return &normalDistribution{
    62  		dist: &distuv.Normal{
    63  			Mu:    mean,
    64  			Sigma: std,
    65  		},
    66  	}
    67  }
    68  
    69  func (d *normalDistribution) Transform(x float64) float64 {
    70  	return x
    71  }
    72  
    73  func (d *normalDistribution) CDF(x float64) float64 {
    74  	return d.dist.CDF(x)
    75  }
    76  
    77  func (d *normalDistribution) Quantile(x float64) float64 {
    78  	return d.dist.Quantile(x)
    79  }
    80  
    81  type Centroid struct {
    82  	Center     []float32
    83  	Calculated atomic.Bool
    84  }
    85  
    86  type EncoderDistribution byte
    87  
    88  const (
    89  	NormalEncoderDistribution    EncoderDistribution = 0
    90  	LogNormalEncoderDistribution EncoderDistribution = 1
    91  )
    92  
    93  type TileEncoder struct {
    94  	bins                float64
    95  	mean                float64
    96  	stdDev              float64
    97  	size                float64
    98  	s1                  float64
    99  	s2                  float64
   100  	segment             int
   101  	centroids           []Centroid
   102  	encoderDistribution EncoderDistribution
   103  	distribution        distribution
   104  }
   105  
   106  func NewTileEncoder(bits int, segment int, encoderDistribution EncoderDistribution) *TileEncoder {
   107  	centroids := math.Pow(2, float64(bits))
   108  	te := &TileEncoder{
   109  		bins:                centroids,
   110  		mean:                0,
   111  		stdDev:              0,
   112  		size:                0,
   113  		s1:                  0,
   114  		s2:                  0,
   115  		segment:             segment,
   116  		centroids:           make([]Centroid, int(centroids)),
   117  		encoderDistribution: encoderDistribution,
   118  	}
   119  	te.setEncoderDistribution()
   120  	return te
   121  }
   122  
   123  func RestoreTileEncoder(bins float64, mean float64, stdDev float64, size float64, s1 float64, s2 float64, segment uint16, encoderDistribution byte) *TileEncoder {
   124  	te := &TileEncoder{
   125  		bins:                bins,
   126  		mean:                mean,
   127  		stdDev:              stdDev,
   128  		size:                size,
   129  		s1:                  s1,
   130  		s2:                  s2,
   131  		segment:             int(segment),
   132  		encoderDistribution: EncoderDistribution(encoderDistribution),
   133  	}
   134  	te.setEncoderDistribution()
   135  	return te
   136  }
   137  
   138  func (te *TileEncoder) ExposeDataForRestore() []byte {
   139  	buffer := make([]byte, 51)
   140  	binary.LittleEndian.PutUint64(buffer[0:8], math.Float64bits(te.bins))
   141  	binary.LittleEndian.PutUint64(buffer[8:16], math.Float64bits(te.mean))
   142  	binary.LittleEndian.PutUint64(buffer[16:24], math.Float64bits(te.stdDev))
   143  	binary.LittleEndian.PutUint64(buffer[24:32], math.Float64bits(te.size))
   144  	binary.LittleEndian.PutUint64(buffer[32:40], math.Float64bits(te.s1))
   145  	binary.LittleEndian.PutUint64(buffer[40:48], math.Float64bits(te.s2))
   146  	binary.LittleEndian.PutUint16(buffer[48:50], uint16(te.segment))
   147  	buffer[50] = byte(te.encoderDistribution)
   148  	return buffer
   149  }
   150  
   151  func (te *TileEncoder) Fit(data [][]float32) error {
   152  	te.setEncoderDistribution()
   153  	return nil
   154  }
   155  
   156  func (te *TileEncoder) setEncoderDistribution() {
   157  	switch te.encoderDistribution {
   158  	case LogNormalEncoderDistribution:
   159  		te.distribution = newLogNormalDistribution(te.mean, te.stdDev)
   160  	case NormalEncoderDistribution:
   161  		te.distribution = newNormalDistribution(te.mean, te.stdDev)
   162  	}
   163  }
   164  
   165  func (te *TileEncoder) Add(x []float32) {
   166  	//  calculate mean and stddev iteratively
   167  	x64 := te.distribution.Transform(float64(x[te.segment]))
   168  	te.s1 += x64
   169  	te.s2 += x64 * x64
   170  	te.size++
   171  	te.mean = te.s1 / te.size
   172  	sum := te.s2 + te.size*te.mean*te.mean
   173  	prod := 2 * te.mean * te.s1
   174  	te.stdDev = math.Sqrt((sum - prod) / te.size)
   175  }
   176  
   177  func (te *TileEncoder) Encode(x []float32) byte {
   178  	cdf := te.distribution.CDF(float64(x[te.segment]))
   179  	intPart, _ := math.Modf(cdf * float64(te.bins))
   180  	return byte(intPart)
   181  }
   182  
   183  func (te *TileEncoder) centroid(b byte) []float32 {
   184  	res := make([]float32, 0, 1)
   185  	if b == 0 {
   186  		res = append(res, float32(te.distribution.Quantile(1/te.bins)))
   187  	} else if b == byte(te.bins) {
   188  		res = append(res, float32(te.distribution.Quantile((te.bins-1)/te.bins)))
   189  	} else {
   190  		b64 := float64(b)
   191  		mean := (b64/te.bins + (b64+1)/te.bins) / 2
   192  		res = append(res, float32(te.distribution.Quantile(mean)))
   193  	}
   194  	return res
   195  }
   196  
   197  func (te *TileEncoder) Centroid(b byte) []float32 {
   198  	if te.centroids[b].Calculated.Load() {
   199  		return te.centroids[b].Center
   200  	}
   201  	te.centroids[b].Center = te.centroid(b)
   202  	te.centroids[b].Calculated.Store(true)
   203  	return te.centroids[b].Center
   204  }