github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/tok/tok.go

/*
 * Copyright 2016-2018 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package tok

import (
	"encoding/binary"
	"plugin"
	"time"

	"github.com/golang/glog"
	geom "github.com/twpayne/go-geom"
	"golang.org/x/crypto/blake2b"

	"github.com/dgraph-io/dgraph/types"
	"github.com/dgraph-io/dgraph/x"
	"github.com/pkg/errors"
)

// Tokenizer identifiers are unique and can't be reused.
// The range 0x00 - 0x7f is system reserved.
// The range 0x80 - 0xff is for custom tokenizers.
// TODO: use these everywhere where we must ensure a system tokenizer.
const (
	IdentNone     = 0x0
	IdentTerm     = 0x1
	IdentExact    = 0x2
	IdentYear     = 0x4
	IdentMonth    = 0x41
	IdentDay      = 0x42
	IdentHour     = 0x43
	IdentGeo      = 0x5
	IdentInt      = 0x6
	IdentFloat    = 0x7
	IdentFullText = 0x8
	IdentBool     = 0x9
	IdentTrigram  = 0xA
	IdentHash     = 0xB
	IdentCustom   = 0x80
)

// Tokenizer defines what a tokenizer must provide.
type Tokenizer interface {

	// Name is the name of the tokenizer. It should be unique.
	Name() string

	// Type returns the string representation of the typeID that we care about.
	Type() string

	// Tokens returns tokens for a given value. The tokens shouldn't be encoded
	// with the byte identifier.
	Tokens(interface{}) ([]string, error)

	// Identifier returns the prefix byte for this token type. It should be
	// unique. The range 0x80 to 0xff (inclusive) is reserved for user-provided
	// custom tokenizers.
	Identifier() byte

	// IsSortable returns true if the tokenizer can be used for sorting/ordering.
	IsSortable() bool

	// IsLossy returns true if we don't store the values directly as index keys
	// during tokenization. If a predicate is tokenized using an IsLossy
	// tokenizer, then we need to fetch the actual value and compare.
	IsLossy() bool
}

var tokenizers = make(map[string]Tokenizer)

func init() {
	registerTokenizer(GeoTokenizer{})
	registerTokenizer(IntTokenizer{})
	registerTokenizer(FloatTokenizer{})
	registerTokenizer(YearTokenizer{})
	registerTokenizer(HourTokenizer{})
	registerTokenizer(MonthTokenizer{})
	registerTokenizer(DayTokenizer{})
	registerTokenizer(ExactTokenizer{})
	registerTokenizer(BoolTokenizer{})
	registerTokenizer(TrigramTokenizer{})
	registerTokenizer(HashTokenizer{})
	registerTokenizer(TermTokenizer{})
	registerTokenizer(FullTextTokenizer{})
	setupBleve()
}

// BuildTokens tokenizes a value, creating strings that can be used to create
// index keys.
func BuildTokens(val interface{}, t Tokenizer) ([]string, error) {
	tokens, err := t.Tokens(val)
	if err != nil {
		return nil, err
	}
	id := t.Identifier()
	for i := range tokens {
		tokens[i] = encodeToken(tokens[i], id)
	}
	return tokens, nil
}
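
// exampleBuildTokens is a usage sketch, not part of the original file: it
// looks up the registered "term" tokenizer and builds identifier-prefixed
// index tokens for a string value. The function name is ours.
func exampleBuildTokens() ([]string, error) {
	t, ok := GetTokenizer("term")
	if !ok {
		return nil, errors.Errorf("term tokenizer not registered")
	}
	// Each token comes back prefixed with IdentTerm (0x1) by encodeToken.
	return BuildTokens("The quick brown fox", t)
}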

// LoadCustomTokenizer reads and loads a custom tokenizer from the given file.
func LoadCustomTokenizer(soFile string) {
	glog.Infof("Loading custom tokenizer from %q", soFile)
	pl, err := plugin.Open(soFile)
	x.Checkf(err, "could not open custom tokenizer plugin file")
	symb, err := pl.Lookup("Tokenizer")
	x.Checkf(err, `could not find symbol "Tokenizer" while loading custom tokenizer: %v`, err)

	// Let any type assertion panics occur, since they will contain a message
	// telling the user what went wrong. Otherwise it's hard to capture this
	// information to pass on to the user.
	tokenizer := symb.(func() interface{})().(PluginTokenizer)

	id := tokenizer.Identifier()
	x.AssertTruef(id >= IdentCustom,
		"custom tokenizer identifier byte must be >= 0x80, but was %#x", id)
	registerTokenizer(CustomTokenizer{PluginTokenizer: tokenizer})
}
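
// As the lookup above shows, a custom tokenizer plugin must export a symbol
// named "Tokenizer" of type func() interface{} whose result implements
// PluginTokenizer. A minimal sketch of such a plugin, built separately with
// `go build -buildmode=plugin`; the rune-counting logic is illustrative only:
//
//	package main
//
//	import "strconv"
//
//	type RuneCountTokenizer struct{}
//
//	func (RuneCountTokenizer) Name() string     { return "runecount" }
//	func (RuneCountTokenizer) Type() string     { return "string" }
//	func (RuneCountTokenizer) Identifier() byte { return 0xfe } // must be >= IdentCustom (0x80)
//	func (RuneCountTokenizer) Tokens(v interface{}) ([]string, error) {
//		return []string{strconv.Itoa(len([]rune(v.(string))))}, nil
//	}
//
//	// Tokenizer is the symbol looked up by LoadCustomTokenizer.
//	func Tokenizer() interface{} { return RuneCountTokenizer{} }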

// GetTokenizerByID tries to find a tokenizer by id in the registered list.
// Returns the tokenizer and true if found, otherwise nil and false.
func GetTokenizerByID(id byte) (Tokenizer, bool) {
	for _, t := range tokenizers {
		if id == t.Identifier() {
			return t, true
		}
	}
	return nil, false
}

// GetTokenizer returns the tokenizer with the given unique name.
func GetTokenizer(name string) (Tokenizer, bool) {
	t, found := tokenizers[name]
	return t, found
}

// GetTokenizers returns a list of tokenizers given a list of unique names.
func GetTokenizers(names []string) ([]Tokenizer, error) {
	var tokenizers []Tokenizer
	for _, name := range names {
		t, found := GetTokenizer(name)
		if !found {
			return nil, errors.Errorf("Invalid tokenizer %s", name)
		}
		tokenizers = append(tokenizers, t)
	}
	return tokenizers, nil
}

func registerTokenizer(t Tokenizer) {
	_, ok := tokenizers[t.Name()]
	x.AssertTruef(!ok, "Duplicate tokenizer: %s", t.Name())
	_, ok = types.TypeForName(t.Type())
	x.AssertTruef(ok, "Invalid type %q for tokenizer %s", t.Type(), t.Name())
	tokenizers[t.Name()] = t
}

// GeoTokenizer generates tokens from geo data.
type GeoTokenizer struct{}

func (t GeoTokenizer) Name() string { return "geo" }
func (t GeoTokenizer) Type() string { return "geo" }
func (t GeoTokenizer) Tokens(v interface{}) ([]string, error) {
	return types.IndexGeoTokens(v.(geom.T))
}
func (t GeoTokenizer) Identifier() byte { return IdentGeo }
func (t GeoTokenizer) IsSortable() bool { return false }
func (t GeoTokenizer) IsLossy() bool    { return true }

// IntTokenizer generates tokens from integer data.
type IntTokenizer struct{}

func (t IntTokenizer) Name() string { return "int" }
func (t IntTokenizer) Type() string { return "int" }
func (t IntTokenizer) Tokens(v interface{}) ([]string, error) {
	return []string{encodeInt(v.(int64))}, nil
}
func (t IntTokenizer) Identifier() byte { return IdentInt }
func (t IntTokenizer) IsSortable() bool { return true }
func (t IntTokenizer) IsLossy() bool    { return false }

// FloatTokenizer generates tokens from floating-point data.
type FloatTokenizer struct{}

func (t FloatTokenizer) Name() string { return "float" }
func (t FloatTokenizer) Type() string { return "float" }
func (t FloatTokenizer) Tokens(v interface{}) ([]string, error) {
	// Floats are truncated to int64 before encoding, which buckets nearby
	// values under the same token; that is why this tokenizer is lossy.
	return []string{encodeInt(int64(v.(float64)))}, nil
}
func (t FloatTokenizer) Identifier() byte { return IdentFloat }
func (t FloatTokenizer) IsSortable() bool { return true }
func (t FloatTokenizer) IsLossy() bool    { return true }

// YearTokenizer generates year tokens from datetime data.
type YearTokenizer struct{}

func (t YearTokenizer) Name() string { return "year" }
func (t YearTokenizer) Type() string { return "datetime" }
func (t YearTokenizer) Tokens(v interface{}) ([]string, error) {
	tval := v.(time.Time)
	buf := make([]byte, 2)
	binary.BigEndian.PutUint16(buf[0:2], uint16(tval.UTC().Year()))
	return []string{string(buf)}, nil
}
func (t YearTokenizer) Identifier() byte { return IdentYear }
func (t YearTokenizer) IsSortable() bool { return true }
func (t YearTokenizer) IsLossy() bool    { return true }

// MonthTokenizer generates month tokens from datetime data.
type MonthTokenizer struct{}

func (t MonthTokenizer) Name() string { return "month" }
func (t MonthTokenizer) Type() string { return "datetime" }
func (t MonthTokenizer) Tokens(v interface{}) ([]string, error) {
	tval := v.(time.Time)
	buf := make([]byte, 4)
	binary.BigEndian.PutUint16(buf[0:2], uint16(tval.UTC().Year()))
	binary.BigEndian.PutUint16(buf[2:4], uint16(tval.UTC().Month()))
	return []string{string(buf)}, nil
}
func (t MonthTokenizer) Identifier() byte { return IdentMonth }
func (t MonthTokenizer) IsSortable() bool { return true }
func (t MonthTokenizer) IsLossy() bool    { return true }

// DayTokenizer generates day tokens from datetime data.
type DayTokenizer struct{}

func (t DayTokenizer) Name() string { return "day" }
func (t DayTokenizer) Type() string { return "datetime" }
func (t DayTokenizer) Tokens(v interface{}) ([]string, error) {
	tval := v.(time.Time)
	buf := make([]byte, 6)
	binary.BigEndian.PutUint16(buf[0:2], uint16(tval.UTC().Year()))
	binary.BigEndian.PutUint16(buf[2:4], uint16(tval.UTC().Month()))
	binary.BigEndian.PutUint16(buf[4:6], uint16(tval.UTC().Day()))
	return []string{string(buf)}, nil
}
func (t DayTokenizer) Identifier() byte { return IdentDay }
func (t DayTokenizer) IsSortable() bool { return true }
func (t DayTokenizer) IsLossy() bool    { return true }

// HourTokenizer generates hour tokens from datetime data.
type HourTokenizer struct{}

func (t HourTokenizer) Name() string { return "hour" }
func (t HourTokenizer) Type() string { return "datetime" }
func (t HourTokenizer) Tokens(v interface{}) ([]string, error) {
	tval := v.(time.Time)
	buf := make([]byte, 8)
	binary.BigEndian.PutUint16(buf[0:2], uint16(tval.UTC().Year()))
	binary.BigEndian.PutUint16(buf[2:4], uint16(tval.UTC().Month()))
	binary.BigEndian.PutUint16(buf[4:6], uint16(tval.UTC().Day()))
	binary.BigEndian.PutUint16(buf[6:8], uint16(tval.UTC().Hour()))
	return []string{string(buf)}, nil
}
func (t HourTokenizer) Identifier() byte { return IdentHour }
func (t HourTokenizer) IsSortable() bool { return true }
func (t HourTokenizer) IsLossy() bool    { return true }
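
// The datetime tokenizers above share a nested big-endian layout: the year
// token is the first two bytes of the month token, which in turn is a prefix
// of the day and hour tokens. A small illustrative check of that prefix
// property (the function name is ours, not part of the original file):
func exampleDatetimePrefix() bool {
	ts := time.Date(2020, time.September, 23, 15, 4, 5, 0, time.UTC)
	yearTok, _ := YearTokenizer{}.Tokens(ts)
	dayTok, _ := DayTokenizer{}.Tokens(ts)
	// A year token is 2 bytes; the day token starts with those same bytes.
	return dayTok[0][:2] == yearTok[0]
}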

// TermTokenizer generates term tokens from string data.
type TermTokenizer struct{}

func (t TermTokenizer) Name() string { return "term" }
func (t TermTokenizer) Type() string { return "string" }
func (t TermTokenizer) Tokens(v interface{}) ([]string, error) {
	str, ok := v.(string)
	if !ok || str == "" {
		return []string{str}, nil
	}
	tokens := termAnalyzer.Analyze([]byte(str))
	return uniqueTerms(tokens), nil
}
func (t TermTokenizer) Identifier() byte { return IdentTerm }
func (t TermTokenizer) IsSortable() bool { return false }
func (t TermTokenizer) IsLossy() bool    { return true }

// ExactTokenizer returns the exact string as a token.
type ExactTokenizer struct{}

func (t ExactTokenizer) Name() string { return "exact" }
func (t ExactTokenizer) Type() string { return "string" }
func (t ExactTokenizer) Tokens(v interface{}) ([]string, error) {
	if term, ok := v.(string); ok {
		return []string{term}, nil
	}
	return nil, errors.Errorf("Exact indices only supported for string types")
}
func (t ExactTokenizer) Identifier() byte { return IdentExact }
func (t ExactTokenizer) IsSortable() bool { return true }
func (t ExactTokenizer) IsLossy() bool    { return false }

// FullTextTokenizer generates full-text tokens from string data.
type FullTextTokenizer struct{ lang string }

func (t FullTextTokenizer) Name() string { return "fulltext" }
func (t FullTextTokenizer) Type() string { return "string" }
func (t FullTextTokenizer) Tokens(v interface{}) ([]string, error) {
	str, ok := v.(string)
	if !ok || str == "" {
		return []string{}, nil
	}
	lang := langBase(t.lang)
	// pass 1 - lowercase and normalize input
	tokens := fulltextAnalyzer.Analyze([]byte(str))
	// pass 2 - filter stop words
	tokens = filterStopwords(lang, tokens)
	// pass 3 - filter stems
	tokens = filterStemmers(lang, tokens)
	// finally, return the terms.
	return uniqueTerms(tokens), nil
}
func (t FullTextTokenizer) Identifier() byte { return IdentFullText }
func (t FullTextTokenizer) IsSortable() bool { return false }
func (t FullTextTokenizer) IsLossy() bool    { return true }

// BoolTokenizer returns tokens from boolean data.
type BoolTokenizer struct{}

func (t BoolTokenizer) Name() string { return "bool" }
func (t BoolTokenizer) Type() string { return "bool" }
func (t BoolTokenizer) Tokens(v interface{}) ([]string, error) {
	var b int64
	if v.(bool) {
		b = 1
	}
	return []string{encodeInt(b)}, nil
}
func (t BoolTokenizer) Identifier() byte { return IdentBool }
func (t BoolTokenizer) IsSortable() bool { return false }
func (t BoolTokenizer) IsLossy() bool    { return false }

// TrigramTokenizer returns trigram tokens from string data.
type TrigramTokenizer struct{}

func (t TrigramTokenizer) Name() string { return "trigram" }
func (t TrigramTokenizer) Type() string { return "string" }
func (t TrigramTokenizer) Tokens(v interface{}) ([]string, error) {
	value, ok := v.(string)
	if !ok {
		return nil, errors.Errorf("Trigram indices only supported for string types")
	}
	l := len(value) - 2
	if l > 0 {
		tokens := make([]string, l)
		for i := 0; i < l; i++ {
			tokens[i] = value[i : i+3]
		}
		tokens = x.RemoveDuplicates(tokens)
		return tokens, nil
	}
	return nil, nil
}
func (t TrigramTokenizer) Identifier() byte { return IdentTrigram }
func (t TrigramTokenizer) IsSortable() bool { return false }
func (t TrigramTokenizer) IsLossy() bool    { return true }
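
// For instance, the value "hello" yields the trigrams "hel", "ell" and "llo"
// (after duplicate removal), while strings shorter than three bytes produce
// no tokens. A tiny usage sketch (the function name is ours):
func exampleTrigrams() ([]string, error) {
	return TrigramTokenizer{}.Tokens("hello")
}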

// HashTokenizer returns hash tokens from string data.
type HashTokenizer struct{}

func (t HashTokenizer) Name() string { return "hash" }
func (t HashTokenizer) Type() string { return "string" }
func (t HashTokenizer) Tokens(v interface{}) ([]string, error) {
	term, ok := v.(string)
	if !ok {
		return nil, errors.Errorf("Hash tokenizer only supported for string types")
	}
	// BLAKE2 is a cryptographic hash function comparable to the SHA family but
	// faster. Cryptographic hashes suit content checksums because of their low
	// collision rates. See issue #2776.
	hash := blake2b.Sum256([]byte(term))
	if len(hash) == 0 {
		return nil, errors.Errorf("Hash tokenizer failed to create hash")
	}
	return []string{string(hash[:])}, nil
}
func (t HashTokenizer) Identifier() byte { return IdentHash }
func (t HashTokenizer) IsSortable() bool { return false }

// IsLossy returns false for the HashTokenizer. This allows us to avoid having
// to retrieve values for the returned results and compare them against the
// value in the query, which is slow. There is a very low probability of
// collisions with a 256-bit hash, and we use that fact to speed up equality
// query operations using the hash index.
func (t HashTokenizer) IsLossy() bool { return false }

// PluginTokenizer is implemented by external plugins loaded dynamically via
// *.so files. It follows the implementation semantics of the Tokenizer
// interface.
//
// Think carefully before modifying this interface, as it would break users' plugins.
type PluginTokenizer interface {
	Name() string
	Type() string
	Tokens(interface{}) ([]string, error)
	Identifier() byte
}

// CustomTokenizer generates tokens from custom logic.
// It doesn't make sense for plugins to implement the IsSortable and IsLossy methods,
// so they're hard-coded.
type CustomTokenizer struct{ PluginTokenizer }

func (t CustomTokenizer) IsSortable() bool { return false }
func (t CustomTokenizer) IsLossy() bool    { return true }

// encodeInt encodes an int64 as a 9-byte string: a leading sign byte (0 for
// negative, 1 for non-negative) followed by the big-endian two's-complement
// representation of the value.
func encodeInt(val int64) string {
	buf := make([]byte, 9)
	binary.BigEndian.PutUint64(buf[1:], uint64(val))
	if val < 0 {
		buf[0] = 0
	} else {
		buf[0] = 1
	}
	return string(buf)
}
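
// This layout makes the lexicographic order of the encoded strings match the
// numeric order of the inputs, which is what lets the int index be sortable.
// A small illustrative check (the function name is ours, not part of the
// original file):
func exampleEncodeIntOrder() bool {
	return encodeInt(-2) < encodeInt(-1) &&
		encodeInt(-1) < encodeInt(0) &&
		encodeInt(0) < encodeInt(42)
}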

// encodeToken prefixes a token with the tokenizer's identifier byte, so that
// tokens produced by different tokenizers occupy disjoint key ranges.
func encodeToken(tok string, typ byte) string {
	return string(typ) + tok
}

// EncodeGeoTokens encodes the given list of tokens as geo tokens.
func EncodeGeoTokens(tokens []string) {
	for i := 0; i < len(tokens); i++ {
		tokens[i] = encodeToken(tokens[i], GeoTokenizer{}.Identifier())
	}
}

// EncodeRegexTokens encodes the given list of strings as regex tokens.
func EncodeRegexTokens(tokens []string) {
	for i := 0; i < len(tokens); i++ {
		tokens[i] = encodeToken(tokens[i], TrigramTokenizer{}.Identifier())
	}
}