github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/aggregator/numerical.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package aggregator
    13  
    14  import (
    15  	"math"
    16  	"sort"
    17  
    18  	"github.com/pkg/errors"
    19  	"github.com/weaviate/weaviate/adapters/repos/db/inverted"
    20  	"github.com/weaviate/weaviate/entities/aggregation"
    21  )
    22  
    23  func addNumericalAggregations(prop *aggregation.Property,
    24  	aggs []aggregation.Aggregator, agg *numericalAggregator,
    25  ) {
    26  	if prop.NumericalAggregations == nil {
    27  		prop.NumericalAggregations = map[string]interface{}{}
    28  	}
    29  	agg.buildPairsFromCounts()
    30  
    31  	// if there are no elements to aggregate over because a filter does not match anything, calculating mean etc. makes
    32  	// no sense. Non-existent entries evaluate to nil with an interface{} map
    33  	if agg.count == 0 {
    34  		for _, entry := range aggs {
    35  			if entry == aggregation.CountAggregator {
    36  				prop.NumericalAggregations["count"] = float64(agg.count)
    37  				break
    38  			}
    39  		}
    40  		return
    41  	}
    42  
    43  	// when combining the results from different shards, we need the raw numbers to recompute the mode, mean and median.
    44  	// Therefore we add a reference later which needs to be cleared out before returning the results to a user
    45  loop:
    46  	for _, aProp := range aggs {
    47  		switch aProp {
    48  		case aggregation.ModeAggregator, aggregation.MedianAggregator, aggregation.MeanAggregator:
    49  			prop.NumericalAggregations["_numericalAggregator"] = agg
    50  			break loop
    51  		}
    52  	}
    53  
    54  	for _, aProp := range aggs {
    55  		switch aProp {
    56  		case aggregation.MeanAggregator:
    57  			prop.NumericalAggregations[aProp.String()] = agg.Mean()
    58  		case aggregation.MinimumAggregator:
    59  			prop.NumericalAggregations[aProp.String()] = agg.Min()
    60  		case aggregation.MaximumAggregator:
    61  			prop.NumericalAggregations[aProp.String()] = agg.Max()
    62  		case aggregation.MedianAggregator:
    63  			prop.NumericalAggregations[aProp.String()] = agg.Median()
    64  		case aggregation.ModeAggregator:
    65  			prop.NumericalAggregations[aProp.String()] = agg.Mode()
    66  		case aggregation.SumAggregator:
    67  			prop.NumericalAggregations[aProp.String()] = agg.Sum()
    68  		case aggregation.CountAggregator:
    69  			prop.NumericalAggregations[aProp.String()] = agg.Count()
    70  		default:
    71  			continue
    72  		}
    73  	}
    74  }
    75  
    76  func newNumericalAggregator() *numericalAggregator {
    77  	return &numericalAggregator{
    78  		min:          math.MaxFloat64,
    79  		max:          -math.MaxFloat64,
    80  		valueCounter: map[float64]uint64{},
    81  		pairs:        make([]floatCountPair, 0),
    82  	}
    83  }
    84  
    85  type numericalAggregator struct {
    86  	count        uint64
    87  	min          float64
    88  	max          float64
    89  	sum          float64
    90  	maxCount     uint64
    91  	mode         float64
    92  	pairs        []floatCountPair   // for row-based median calculation
    93  	valueCounter map[float64]uint64 // for individual median calculation
    94  }
    95  
    96  type floatCountPair struct {
    97  	value float64
    98  	count uint64
    99  }
   100  
// AddFloat64 adds a single occurrence of value to the aggregation by
// delegating to AddNumberRow with a count of 1. It returns whatever
// AddNumberRow returns.
func (a *numericalAggregator) AddFloat64(value float64) error {
	return a.AddNumberRow(value, 1)
}
   104  
   105  // turns the value counter into a sorted list, as well as identifying the mode. Must be called before calling median etc
   106  func (a *numericalAggregator) buildPairsFromCounts() {
   107  	a.pairs = a.pairs[:0] // clear out old values in case this function called more than once
   108  	a.pairs = append(a.pairs, make([]floatCountPair, 0, len(a.valueCounter))...)
   109  
   110  	for value, count := range a.valueCounter {
   111  		// get one with higher count or lower value if counts are equal
   112  		if count > a.maxCount || (count == a.maxCount && value < a.mode) {
   113  			a.maxCount = count
   114  			a.mode = value
   115  		}
   116  		a.pairs = append(a.pairs, floatCountPair{value: value, count: count})
   117  	}
   118  
   119  	sort.Slice(a.pairs, func(x, y int) bool {
   120  		return a.pairs[x].value < a.pairs[y].value
   121  	})
   122  }
   123  
   124  func (a *numericalAggregator) AddFloat64Row(number []byte,
   125  	count uint64,
   126  ) error {
   127  	numberParsed, err := inverted.ParseLexicographicallySortableFloat64(number)
   128  	if err != nil {
   129  		return errors.Wrap(err, "read float64")
   130  	}
   131  
   132  	return a.AddNumberRow(numberParsed, count)
   133  }
   134  
   135  func (a *numericalAggregator) AddInt64Row(number []byte, count uint64) error {
   136  	numberParsed, err := inverted.ParseLexicographicallySortableInt64(number)
   137  	if err != nil {
   138  		return errors.Wrap(err, "read int64")
   139  	}
   140  
   141  	return a.AddNumberRow(float64(numberParsed), count)
   142  }
   143  
   144  func (a *numericalAggregator) AddNumberRow(number float64, count uint64) error {
   145  	if count == 0 {
   146  		// skip
   147  		return nil
   148  	}
   149  
   150  	a.count += count
   151  	a.sum += number * float64(count)
   152  	if number < a.min {
   153  		a.min = number
   154  	}
   155  	if number > a.max {
   156  		a.max = number
   157  	}
   158  
   159  	currentCount := a.valueCounter[number]
   160  	currentCount += count
   161  	a.valueCounter[number] = currentCount
   162  
   163  	return nil
   164  }
   165  
   166  func (a *numericalAggregator) Mean() float64 {
   167  	if a.count == 0 {
   168  		return 0
   169  	}
   170  	return a.sum / float64(a.count)
   171  }
   172  
// Max returns the largest value recorded via AddNumberRow. If nothing has been
// recorded yet, it returns the initial value of the field, which is
// -math.MaxFloat64 for aggregators created with newNumericalAggregator.
func (a *numericalAggregator) Max() float64 {
	return a.max
}
   176  
// Min returns the smallest value recorded via AddNumberRow. If nothing has
// been recorded yet, it returns the initial value of the field, which is
// math.MaxFloat64 for aggregators created with newNumericalAggregator.
func (a *numericalAggregator) Min() float64 {
	return a.min
}
   180  
// Sum returns the sum of all recorded values, where every value contributes
// once per occurrence (value * count for each added row).
func (a *numericalAggregator) Sum() float64 {
	return a.sum
}
   184  
// Count returns the total number of recorded values (the sum of the counts of
// all added rows), converted to float64.
func (a *numericalAggregator) Count() float64 {
	return float64(a.count)
}
   188  
// Mode returns the most frequent value as determined by buildPairsFromCounts;
// if several values share the highest count, the smallest of them is reported.
//
// Within this file the mode is only updated by buildPairsFromCounts, so that
// method has to be called after the last value was added and before Mode is
// used. Without such a call the zero value of the field (0) is returned.
func (a *numericalAggregator) Mode() float64 {
	return a.mode
}
   194  
   195  // Median does not require preparation if build from rows, but requires a call of
   196  // buildPairsFromCounts() if it was built using individual objects. The call will panic
   197  // if called without adding at least one element or without calling buildPairsFromCounts()
   198  //
   199  // since the pairs are read from an inverted index, which is in turn
   200  // lexicographically sorted, we know that our pairs must also be sorted
   201  //
   202  // There are two cases:
   203  // a) There is an uneven number of elements, then the median element is at index N/2
   204  // b) There is an even number of elements, then the median element is (elem_(N/2) + elem_(N/2+1))/2.
   205  //
   206  //	with two sub-cases:
   207  //	  b1) element N/2 and N/2 + 1 are within the same pair, then the median is the value of this pair
   208  //	  b2) element N/2 and N/2 are part of different pairs, then the average of these pairs is the median and the
   209  //	      median value is not part of the collection itself
   210  func (a *numericalAggregator) Median() float64 {
   211  	middleIndex := a.count / 2
   212  	count := uint64(0)
   213  	for index, pair := range a.pairs {
   214  		count += pair.count
   215  		if a.count%2 == 1 && count > middleIndex {
   216  			return pair.value // case a)
   217  		} else if a.count%2 == 0 {
   218  			if count == middleIndex {
   219  				return (pair.value + a.pairs[index+1].value) / 2 // case b2)
   220  			} else if count > middleIndex {
   221  				return pair.value // case b1)
   222  			}
   223  		}
   224  	}
   225  	panic("Couldn't determine median. This should never happen. Did you add values and call buildRows before?")
   226  }