github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/aggregator/date.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package aggregator
    13  
    14  import (
    15  	"fmt"
    16  	"math"
    17  	"sort"
    18  	"time"
    19  
    20  	"github.com/pkg/errors"
    21  	"github.com/weaviate/weaviate/adapters/repos/db/inverted"
    22  	"github.com/weaviate/weaviate/entities/aggregation"
    23  )
    24  
    25  func addDateAggregations(prop *aggregation.Property,
    26  	aggs []aggregation.Aggregator, agg *dateAggregator,
    27  ) {
    28  	if prop.DateAggregations == nil {
    29  		prop.DateAggregations = map[string]interface{}{}
    30  	}
    31  	agg.buildPairsFromCounts()
    32  
    33  	// if there are no elements to aggregate over because a filter does not match anything, calculating median etc. makes
    34  	// no sense. Non-existent entries evaluate to nil with an interface{} map
    35  	if agg.count == 0 {
    36  		for _, entry := range aggs {
    37  			if entry == aggregation.CountAggregator {
    38  				prop.DateAggregations["count"] = int64(agg.count)
    39  				break
    40  			}
    41  		}
    42  		return
    43  	}
    44  
    45  	// when combining the results from different shards, we need the raw dates to recompute the mode and median.
    46  	// Therefore we add a reference later which needs to be cleared out before returning the results to a user
    47  	for _, aProp := range aggs {
    48  		switch aProp {
    49  		case aggregation.ModeAggregator, aggregation.MedianAggregator:
    50  			prop.DateAggregations["_dateAggregator"] = agg
    51  		}
    52  	}
    53  
    54  	for _, aProp := range aggs {
    55  		switch aProp {
    56  		case aggregation.MinimumAggregator:
    57  			prop.DateAggregations[aProp.String()] = agg.Min()
    58  		case aggregation.MaximumAggregator:
    59  			prop.DateAggregations[aProp.String()] = agg.Max()
    60  		case aggregation.ModeAggregator:
    61  			prop.DateAggregations[aProp.String()] = agg.Mode()
    62  		case aggregation.CountAggregator:
    63  			prop.DateAggregations[aProp.String()] = agg.Count()
    64  		case aggregation.MedianAggregator:
    65  			prop.DateAggregations[aProp.String()] = agg.Median()
    66  
    67  		default:
    68  			continue
    69  		}
    70  	}
    71  }
    72  
    73  type dateAggregator struct {
    74  	count        uint64
    75  	maxCount     uint64
    76  	min          timestamp
    77  	max          timestamp
    78  	mode         timestamp
    79  	pairs        []timestampCountPair // for row-based median calculation
    80  	valueCounter map[timestamp]uint64 // for individual median calculation
    81  }
    82  
    83  func newDateAggregator() *dateAggregator {
    84  	return &dateAggregator{
    85  		min:          timestamp{epochNano: math.MaxInt64},
    86  		valueCounter: map[timestamp]uint64{},
    87  		pairs:        make([]timestampCountPair, 0),
    88  	}
    89  }
    90  
// timestamp holds two representations of the same point in time: the
// nanosecond value, which is used for all numerical comparisons, and the
// string value, which is what the user expects to see.
//
// Both fields take part in equality, and timestamp is the key type of
// dateAggregator.valueCounter, so two timestamps with the same epochNano but
// different strings are counted as distinct values.
type timestamp struct {
	epochNano int64  // nanoseconds relative to the Unix epoch
	rfc3339   string // string form returned by Min, Max, Mode and Median
}
    98  
    99  func newTimestamp(epochNano int64) timestamp {
   100  	return timestamp{
   101  		epochNano: epochNano,
   102  		rfc3339:   time.Unix(0, epochNano).UTC().Format(time.RFC3339Nano),
   103  	}
   104  }
   105  
// timestampCountPair couples a distinct timestamp with the number of times it
// occurred. A slice of these, sorted by epochNano, is what the median
// calculation iterates over.
type timestampCountPair struct {
	value timestamp // the distinct timestamp
	count uint64    // how many times value occurred
}
   110  
   111  func (a *dateAggregator) AddTimestamp(rfc3339 string) error {
   112  	t, err := time.Parse(time.RFC3339Nano, rfc3339)
   113  	if err != nil {
   114  		return fmt.Errorf("failed to parse timestamp: %s", err)
   115  	}
   116  
   117  	ts := timestamp{
   118  		epochNano: t.UnixNano(),
   119  		rfc3339:   rfc3339,
   120  	}
   121  	return a.addRow(ts, 1)
   122  }
   123  
   124  func (a *dateAggregator) AddTimestampRow(b []byte, count uint64) error {
   125  	nsec, err := inverted.ParseLexicographicallySortableInt64(b)
   126  	if err != nil {
   127  		return errors.Wrap(err, "read int64")
   128  	}
   129  
   130  	ts := newTimestamp(nsec)
   131  
   132  	return a.addRow(ts, count)
   133  }
   134  
   135  func (a *dateAggregator) addRow(ts timestamp, count uint64) error {
   136  	if count == 0 {
   137  		// skip
   138  		return nil
   139  	}
   140  
   141  	a.count += count
   142  	if ts.epochNano < a.min.epochNano {
   143  		a.min = ts
   144  	}
   145  	if ts.epochNano > a.max.epochNano {
   146  		a.max = ts
   147  	}
   148  
   149  	currentCount := a.valueCounter[ts]
   150  	currentCount += count
   151  	a.valueCounter[ts] = currentCount
   152  
   153  	return nil
   154  }
   155  
   156  func (a *dateAggregator) Max() string {
   157  	return a.max.rfc3339
   158  }
   159  
   160  func (a *dateAggregator) Min() string {
   161  	return a.min.rfc3339
   162  }
   163  
   164  // Mode does not require preparation if build from rows, but requires a call of
   165  // buildPairsFromCounts() if it was built using individual objects
   166  func (a *dateAggregator) Mode() string {
   167  	return a.mode.rfc3339
   168  }
   169  
   170  func (a *dateAggregator) Count() int64 {
   171  	return int64(a.count)
   172  }
   173  
   174  // Median does not require preparation if build from rows, but requires a call of
   175  // buildPairsFromCounts() if it was built using individual objects
   176  //
   177  // Check the numericalAggregator.Median() for details about the calculation
   178  func (a *dateAggregator) Median() string {
   179  	middleIndex := a.count / 2
   180  	count := uint64(0)
   181  	for index, pair := range a.pairs {
   182  		count += pair.count
   183  		if a.count%2 == 1 && count > middleIndex {
   184  			return pair.value.rfc3339 // case a)
   185  		} else if a.count%2 == 0 {
   186  			if count == middleIndex {
   187  				MedianEpochNano := pair.value.epochNano + (a.pairs[index+1].value.epochNano-pair.value.epochNano)/2
   188  				return time.Unix(0, MedianEpochNano).UTC().Format(time.RFC3339Nano) // case b2)
   189  			} else if count > middleIndex {
   190  				return pair.value.rfc3339 // case b1)
   191  			}
   192  		}
   193  	}
   194  	panic("Couldn't determine median. This should never happen. Did you add values and call buildRows before?")
   195  }
   196  
   197  // turns the value counter into a sorted list, as well as identifying the mode
   198  func (a *dateAggregator) buildPairsFromCounts() {
   199  	a.pairs = a.pairs[:0] // clear out old values in case this function called more than once
   200  	for value, count := range a.valueCounter {
   201  		if count > a.maxCount {
   202  			a.maxCount = count
   203  			a.mode = value
   204  		}
   205  		a.pairs = append(a.pairs, timestampCountPair{value: value, count: count})
   206  	}
   207  
   208  	sort.Slice(a.pairs, func(x, y int) bool {
   209  		return a.pairs[x].value.epochNano < a.pairs[y].value.epochNano
   210  	})
   211  }