github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/analyzer.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package inverted
    13  
    14  import (
    15  	"bytes"
    16  	"encoding/binary"
    17  
    18  	"github.com/google/uuid"
    19  	"github.com/weaviate/weaviate/adapters/repos/db/helpers"
    20  	"github.com/weaviate/weaviate/entities/models"
    21  )
    22  
    23  type IsFallbackToSearchable func() bool
    24  
    25  type Countable struct {
    26  	Data          []byte
    27  	TermFrequency float32
    28  }
    29  
    30  type Property struct {
    31  	Name               string
    32  	Items              []Countable
    33  	Length             int
    34  	HasFilterableIndex bool // roaring set index
    35  	HasSearchableIndex bool // map index (with frequencies)
    36  }
    37  
    38  type NilProperty struct {
    39  	Name                string
    40  	AddToPropertyLength bool
    41  }
    42  
    43  func DedupItems(props []Property) []Property {
    44  	for i := range props {
    45  		seen := map[string]struct{}{}
    46  		items := props[i].Items
    47  
    48  		var key string
    49  		// reverse order to keep latest elements
    50  		for j := len(items) - 1; j >= 0; j-- {
    51  			key = string(items[j].Data)
    52  			if _, ok := seen[key]; ok {
    53  				// remove element already seen
    54  				items = append(items[:j], items[j+1:]...)
    55  			}
    56  			seen[key] = struct{}{}
    57  		}
    58  		props[i].Items = items
    59  	}
    60  	return props
    61  }
    62  
    63  type Analyzer struct {
    64  	isFallbackToSearchable IsFallbackToSearchable
    65  }
    66  
    67  // Text tokenizes given input according to selected tokenization,
    68  // then aggregates duplicates
    69  func (a *Analyzer) Text(tokenization, in string) []Countable {
    70  	return a.TextArray(tokenization, []string{in})
    71  }
    72  
    73  // TextArray tokenizes given input according to selected tokenization,
    74  // then aggregates duplicates
    75  func (a *Analyzer) TextArray(tokenization string, inArr []string) []Countable {
    76  	var terms []string
    77  	for _, in := range inArr {
    78  		terms = append(terms, helpers.Tokenize(tokenization, in)...)
    79  	}
    80  
    81  	counts := map[string]uint64{}
    82  	for _, term := range terms {
    83  		counts[term]++
    84  	}
    85  
    86  	countable := make([]Countable, len(counts))
    87  	i := 0
    88  	for term, count := range counts {
    89  		countable[i] = Countable{
    90  			Data:          []byte(term),
    91  			TermFrequency: float32(count),
    92  		}
    93  		i++
    94  	}
    95  	return countable
    96  }
    97  
    98  // Int requires no analysis, so it's actually just a simple conversion to a
    99  // string-formatted byte slice of the int
   100  func (a *Analyzer) Int(in int64) ([]Countable, error) {
   101  	data, err := LexicographicallySortableInt64(in)
   102  	if err != nil {
   103  		return nil, err
   104  	}
   105  
   106  	return []Countable{
   107  		{
   108  			Data: data,
   109  		},
   110  	}, nil
   111  }
   112  
   113  // UUID requires no analysis, so it's just dumping the raw binary representation
   114  func (a *Analyzer) UUID(in uuid.UUID) ([]Countable, error) {
   115  	return []Countable{
   116  		{
   117  			Data: in[:],
   118  		},
   119  	}, nil
   120  }
   121  
   122  // UUID array requires no analysis, so it's just dumping the raw binary
   123  // representation of each contained element
   124  func (a *Analyzer) UUIDArray(in []uuid.UUID) ([]Countable, error) {
   125  	out := make([]Countable, len(in))
   126  	for i := range in {
   127  		out[i] = Countable{
   128  			Data: in[i][:],
   129  		}
   130  	}
   131  
   132  	return out, nil
   133  }
   134  
   135  // Int array requires no analysis, so it's actually just a simple conversion to a
   136  // string-formatted byte slice of the int
   137  func (a *Analyzer) IntArray(in []int64) ([]Countable, error) {
   138  	out := make([]Countable, len(in))
   139  	for i := range in {
   140  		data, err := LexicographicallySortableInt64(in[i])
   141  		if err != nil {
   142  			return nil, err
   143  		}
   144  		out[i] = Countable{Data: data}
   145  	}
   146  
   147  	return out, nil
   148  }
   149  
   150  // Float requires no analysis, so it's actually just a simple conversion to a
   151  // lexicographically sortable byte slice.
   152  func (a *Analyzer) Float(in float64) ([]Countable, error) {
   153  	data, err := LexicographicallySortableFloat64(in)
   154  	if err != nil {
   155  		return nil, err
   156  	}
   157  
   158  	return []Countable{
   159  		{
   160  			Data: data,
   161  		},
   162  	}, nil
   163  }
   164  
   165  // Float array requires no analysis, so it's actually just a simple conversion to a
   166  // lexicographically sortable byte slice.
   167  func (a *Analyzer) FloatArray(in []float64) ([]Countable, error) {
   168  	out := make([]Countable, len(in))
   169  	for i := range in {
   170  		data, err := LexicographicallySortableFloat64(in[i])
   171  		if err != nil {
   172  			return nil, err
   173  		}
   174  		out[i] = Countable{Data: data}
   175  	}
   176  
   177  	return out, nil
   178  }
   179  
   180  // BoolArray requires no analysis, so it's actually just a simple conversion to a
   181  // little-endian ordered byte slice
   182  func (a *Analyzer) BoolArray(in []bool) ([]Countable, error) {
   183  	out := make([]Countable, len(in))
   184  	for i := range in {
   185  		b := bytes.NewBuffer(nil)
   186  		err := binary.Write(b, binary.LittleEndian, &in[i])
   187  		if err != nil {
   188  			return nil, err
   189  		}
   190  		out[i] = Countable{Data: b.Bytes()}
   191  	}
   192  
   193  	return out, nil
   194  }
   195  
   196  // Bool requires no analysis, so it's actually just a simple conversion to a
   197  // little-endian ordered byte slice
   198  func (a *Analyzer) Bool(in bool) ([]Countable, error) {
   199  	b := bytes.NewBuffer(nil)
   200  	err := binary.Write(b, binary.LittleEndian, &in)
   201  	if err != nil {
   202  		return nil, err
   203  	}
   204  
   205  	return []Countable{
   206  		{
   207  			Data: b.Bytes(),
   208  		},
   209  	}, nil
   210  }
   211  
   212  // RefCount does not index the content of the refs, but only the count with 0
   213  // being an explicitly allowed value as well.
   214  func (a *Analyzer) RefCount(in models.MultipleRef) ([]Countable, error) {
   215  	length := uint64(len(in))
   216  	data, err := LexicographicallySortableUint64(length)
   217  	if err != nil {
   218  		return nil, err
   219  	}
   220  
   221  	return []Countable{
   222  		{
   223  			Data: data,
   224  		},
   225  	}, nil
   226  }
   227  
   228  // Ref indexes references as beacon-strings
   229  func (a *Analyzer) Ref(in models.MultipleRef) ([]Countable, error) {
   230  	out := make([]Countable, len(in))
   231  
   232  	for i, ref := range in {
   233  		out[i] = Countable{
   234  			Data: []byte(ref.Beacon),
   235  		}
   236  	}
   237  
   238  	return out, nil
   239  }
   240  
   241  func NewAnalyzer(isFallbackToSearchable IsFallbackToSearchable) *Analyzer {
   242  	if isFallbackToSearchable == nil {
   243  		isFallbackToSearchable = func() bool { return false }
   244  	}
   245  	return &Analyzer{isFallbackToSearchable: isFallbackToSearchable}
   246  }