github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/aggregator/text.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package aggregator
    13  
    14  import (
    15  	"sort"
    16  
    17  	"github.com/pkg/errors"
    18  	"github.com/weaviate/weaviate/entities/aggregation"
    19  	"github.com/weaviate/weaviate/entities/schema"
    20  	"github.com/weaviate/weaviate/entities/storobj"
    21  )
    22  
    23  func extractLimitFromTopOccs(aggs []aggregation.Aggregator) int {
    24  	for _, agg := range aggs {
    25  		if agg.Type == aggregation.TopOccurrencesType && agg.Limit != nil {
    26  			return *agg.Limit
    27  		}
    28  	}
    29  
    30  	// we couldn't extract a limit, default to something reasonable
    31  	return 5
    32  }
    33  
    34  func newTextAggregator(limit int) *textAggregator {
    35  	return &textAggregator{itemCounter: map[string]int{}, max: limit}
    36  }
    37  
// textAggregator counts how often each text value was added and derives the
// most frequent values from those counts.
type textAggregator struct {
	// max caps the number of entries kept in topPairs. count is incremented
	// once per AddText call and is reported as the total by Res.
	max   int
	count uint64

	// itemCounter maps each distinct value to the number of times it was added
	itemCounter map[string]int

	// topPairs is filled by Res through insertOrdered. It is always kept
	// sorted (most occurrences first, equal counts ordered by ascending
	// value), so the last element can be cut off when the list grows larger
	// than max.
	topPairs []aggregation.TextOccurrence
}
    48  
    49  func (a *Aggregator) parseAndAddTextRow(agg *textAggregator,
    50  	v []byte, propName schema.PropertyName,
    51  ) error {
    52  	items, ok, err := storobj.ParseAndExtractTextProp(v, propName.String())
    53  	if err != nil {
    54  		return errors.Wrap(err, "parse and extract prop")
    55  	}
    56  
    57  	if !ok {
    58  		return nil
    59  	}
    60  
    61  	for i := range items {
    62  		if err := agg.AddText(items[i]); err != nil {
    63  			return err
    64  		}
    65  	}
    66  	return nil
    67  }
    68  
    69  func (a *textAggregator) AddText(value string) error {
    70  	a.count++
    71  
    72  	itemCount := a.itemCounter[value]
    73  	itemCount++
    74  	a.itemCounter[value] = itemCount
    75  	return nil
    76  }
    77  
    78  func (a *textAggregator) insertOrdered(elem aggregation.TextOccurrence) {
    79  	if len(a.topPairs) == 0 {
    80  		a.topPairs = []aggregation.TextOccurrence{elem}
    81  		return
    82  	}
    83  
    84  	added := false
    85  	for i, pair := range a.topPairs {
    86  		if pair.Occurs > elem.Occurs {
    87  			continue
    88  		}
    89  		// if number of occurrences is the same,
    90  		// skip if string is after one in topPairs
    91  		if pair.Occurs == elem.Occurs && pair.Value < elem.Value {
    92  			continue
    93  		}
    94  
    95  		// we have found the first one that's smaller so me must insert before i
    96  		a.topPairs = append(
    97  			a.topPairs[:i], append(
    98  				[]aggregation.TextOccurrence{elem},
    99  				a.topPairs[i:]...,
   100  			)...,
   101  		)
   102  
   103  		added = true
   104  		break
   105  	}
   106  
   107  	if len(a.topPairs) > a.max {
   108  		a.topPairs = a.topPairs[:len(a.topPairs)-1]
   109  	}
   110  
   111  	if !added && len(a.topPairs) < a.max {
   112  		a.topPairs = append(a.topPairs, elem)
   113  	}
   114  }
   115  
   116  func (a *textAggregator) Res() aggregation.Text {
   117  	out := aggregation.Text{}
   118  	if a.count == 0 {
   119  		return out
   120  	}
   121  
   122  	for value, count := range a.itemCounter {
   123  		a.insertOrdered(aggregation.TextOccurrence{
   124  			Value:  value,
   125  			Occurs: count,
   126  		})
   127  	}
   128  
   129  	out.Items = a.topPairs
   130  	sort.SliceStable(out.Items, func(a, b int) bool {
   131  		countA := out.Items[a].Occurs
   132  		countB := out.Items[b].Occurs
   133  
   134  		if countA != countB {
   135  			return countA > countB
   136  		}
   137  
   138  		valueA := out.Items[a].Value
   139  		valueB := out.Items[b].Value
   140  		if len(valueA) == 0 || len(valueB) == 0 {
   141  			return false // order doesn't matter in this case, just prevent a panic
   142  		}
   143  
   144  		return valueA[0] < valueB[0]
   145  	})
   146  
   147  	out.Count = int(a.count)
   148  	return out
   149  }