github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/aggregator/numerical.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package aggregator 13 14 import ( 15 "math" 16 "sort" 17 18 "github.com/pkg/errors" 19 "github.com/weaviate/weaviate/adapters/repos/db/inverted" 20 "github.com/weaviate/weaviate/entities/aggregation" 21 ) 22 23 func addNumericalAggregations(prop *aggregation.Property, 24 aggs []aggregation.Aggregator, agg *numericalAggregator, 25 ) { 26 if prop.NumericalAggregations == nil { 27 prop.NumericalAggregations = map[string]interface{}{} 28 } 29 agg.buildPairsFromCounts() 30 31 // if there are no elements to aggregate over because a filter does not match anything, calculating mean etc. makes 32 // no sense. Non-existent entries evaluate to nil with an interface{} map 33 if agg.count == 0 { 34 for _, entry := range aggs { 35 if entry == aggregation.CountAggregator { 36 prop.NumericalAggregations["count"] = float64(agg.count) 37 break 38 } 39 } 40 return 41 } 42 43 // when combining the results from different shards, we need the raw numbers to recompute the mode, mean and median. 44 // Therefore we add a reference later which needs to be cleared out before returning the results to a user 45 loop: 46 for _, aProp := range aggs { 47 switch aProp { 48 case aggregation.ModeAggregator, aggregation.MedianAggregator, aggregation.MeanAggregator: 49 prop.NumericalAggregations["_numericalAggregator"] = agg 50 break loop 51 } 52 } 53 54 for _, aProp := range aggs { 55 switch aProp { 56 case aggregation.MeanAggregator: 57 prop.NumericalAggregations[aProp.String()] = agg.Mean() 58 case aggregation.MinimumAggregator: 59 prop.NumericalAggregations[aProp.String()] = agg.Min() 60 case aggregation.MaximumAggregator: 61 prop.NumericalAggregations[aProp.String()] = agg.Max() 62 case aggregation.MedianAggregator: 63 prop.NumericalAggregations[aProp.String()] = agg.Median() 64 case aggregation.ModeAggregator: 65 prop.NumericalAggregations[aProp.String()] = agg.Mode() 66 case aggregation.SumAggregator: 67 prop.NumericalAggregations[aProp.String()] = agg.Sum() 68 case aggregation.CountAggregator: 69 prop.NumericalAggregations[aProp.String()] = agg.Count() 70 default: 71 continue 72 } 73 } 74 } 75 76 func newNumericalAggregator() *numericalAggregator { 77 return &numericalAggregator{ 78 min: math.MaxFloat64, 79 max: -math.MaxFloat64, 80 valueCounter: map[float64]uint64{}, 81 pairs: make([]floatCountPair, 0), 82 } 83 } 84 85 type numericalAggregator struct { 86 count uint64 87 min float64 88 max float64 89 sum float64 90 maxCount uint64 91 mode float64 92 pairs []floatCountPair // for row-based median calculation 93 valueCounter map[float64]uint64 // for individual median calculation 94 } 95 96 type floatCountPair struct { 97 value float64 98 count uint64 99 } 100 101 func (a *numericalAggregator) AddFloat64(value float64) error { 102 return a.AddNumberRow(value, 1) 103 } 104 105 // turns the value counter into a sorted list, as well as identifying the mode. Must be called before calling median etc 106 func (a *numericalAggregator) buildPairsFromCounts() { 107 a.pairs = a.pairs[:0] // clear out old values in case this function called more than once 108 a.pairs = append(a.pairs, make([]floatCountPair, 0, len(a.valueCounter))...) 109 110 for value, count := range a.valueCounter { 111 // get one with higher count or lower value if counts are equal 112 if count > a.maxCount || (count == a.maxCount && value < a.mode) { 113 a.maxCount = count 114 a.mode = value 115 } 116 a.pairs = append(a.pairs, floatCountPair{value: value, count: count}) 117 } 118 119 sort.Slice(a.pairs, func(x, y int) bool { 120 return a.pairs[x].value < a.pairs[y].value 121 }) 122 } 123 124 func (a *numericalAggregator) AddFloat64Row(number []byte, 125 count uint64, 126 ) error { 127 numberParsed, err := inverted.ParseLexicographicallySortableFloat64(number) 128 if err != nil { 129 return errors.Wrap(err, "read float64") 130 } 131 132 return a.AddNumberRow(numberParsed, count) 133 } 134 135 func (a *numericalAggregator) AddInt64Row(number []byte, count uint64) error { 136 numberParsed, err := inverted.ParseLexicographicallySortableInt64(number) 137 if err != nil { 138 return errors.Wrap(err, "read int64") 139 } 140 141 return a.AddNumberRow(float64(numberParsed), count) 142 } 143 144 func (a *numericalAggregator) AddNumberRow(number float64, count uint64) error { 145 if count == 0 { 146 // skip 147 return nil 148 } 149 150 a.count += count 151 a.sum += number * float64(count) 152 if number < a.min { 153 a.min = number 154 } 155 if number > a.max { 156 a.max = number 157 } 158 159 currentCount := a.valueCounter[number] 160 currentCount += count 161 a.valueCounter[number] = currentCount 162 163 return nil 164 } 165 166 func (a *numericalAggregator) Mean() float64 { 167 if a.count == 0 { 168 return 0 169 } 170 return a.sum / float64(a.count) 171 } 172 173 func (a *numericalAggregator) Max() float64 { 174 return a.max 175 } 176 177 func (a *numericalAggregator) Min() float64 { 178 return a.min 179 } 180 181 func (a *numericalAggregator) Sum() float64 { 182 return a.sum 183 } 184 185 func (a *numericalAggregator) Count() float64 { 186 return float64(a.count) 187 } 188 189 // Mode does not require preparation if build from rows, but requires a call of 190 // buildPairsFromCounts() if it was built using individual objects 191 func (a *numericalAggregator) Mode() float64 { 192 return a.mode 193 } 194 195 // Median does not require preparation if build from rows, but requires a call of 196 // buildPairsFromCounts() if it was built using individual objects. The call will panic 197 // if called without adding at least one element or without calling buildPairsFromCounts() 198 // 199 // since the pairs are read from an inverted index, which is in turn 200 // lexicographically sorted, we know that our pairs must also be sorted 201 // 202 // There are two cases: 203 // a) There is an uneven number of elements, then the median element is at index N/2 204 // b) There is an even number of elements, then the median element is (elem_(N/2) + elem_(N/2+1))/2. 205 // 206 // with two sub-cases: 207 // b1) element N/2 and N/2 + 1 are within the same pair, then the median is the value of this pair 208 // b2) element N/2 and N/2 are part of different pairs, then the average of these pairs is the median and the 209 // median value is not part of the collection itself 210 func (a *numericalAggregator) Median() float64 { 211 middleIndex := a.count / 2 212 count := uint64(0) 213 for index, pair := range a.pairs { 214 count += pair.count 215 if a.count%2 == 1 && count > middleIndex { 216 return pair.value // case a) 217 } else if a.count%2 == 0 { 218 if count == middleIndex { 219 return (pair.value + a.pairs[index+1].value) / 2 // case b2) 220 } else if count > middleIndex { 221 return pair.value // case b1) 222 } 223 } 224 } 225 panic("Couldn't determine median. This should never happen. Did you add values and call buildRows before?") 226 }